/* Home | History | Annotate | Download | only in priv */
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * some of the FCOM cases could do with testing -- not convinced
     55      that the args are the right way round.
     56 
     57    * FSAVE does not re-initialise the FPU; it should do
     58 
     59    * FINIT not only initialises the FPU environment, it also zeroes
     60      all the FP registers.  It should leave the registers unchanged.
     61 
     62     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     63     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     64     only way to observe eflags[1], a proper fix would be to make that
     65     bit be set by PUSHF.
     66 
     67     This module uses global variables and so is not MT-safe (if that
     68     should ever become relevant).
     69 */
     70 
     71 /* Notes re address size overrides (0x67).
     72 
     73    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     74    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     75    and System Instructions"), Section 1.2.3 ("Address-Size Override
     76    Prefix"):
     77 
     78    0x67 applies to all explicit memory references, causing the top
     79    32 bits of the effective address to become zero.
     80 
     81    0x67 has no effect on stack references (push/pop); these always
     82    use a 64-bit address.
     83 
     84    0x67 changes the interpretation of instructions which implicitly
     85    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     86    instead.  These are:
     87 
     88       cmp{s,sb,sw,sd,sq}
     89       in{s,sb,sw,sd}
     90       jcxz, jecxz, jrcxz
     91       lod{s,sb,sw,sd,sq}
     92       loop{,e,bz,be,z}
     93       mov{s,sb,sw,sd,sq}
     94       out{s,sb,sw,sd}
     95       rep{,e,ne,nz}
     96       sca{s,sb,sw,sd,sq}
     97       sto{s,sb,sw,sd,sq}
     98       xlat{,b} */
     99 
    100 /* "Special" instructions.
    101 
    102    This instruction decoder can decode three special instructions
    103    which mean nothing natively (are no-ops as far as regs/mem are
    104    concerned) but have meaning for supporting Valgrind.  A special
    105    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    106    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    107    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    108    Following that, one of the following 3 are allowed (standard
    109    interpretation in parentheses):
    110 
    111       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    112       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    113       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    114       4887F6 (xchgq %rdi,%rdi)   IR injection
    115 
    116    Any other bytes following the 16-byte preamble are illegal and
    117    constitute a failure in instruction decoding.  This all assumes
    118    that the preamble will never occur except in specific code
    119    fragments designed for Valgrind to catch.
    120 
    121    No prefixes may precede a "Special" instruction.
    122 */
    123 
    124 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    125    insns: the side-exit back to the start of the insn is done with
    126    Ijk_Boring.  This is quite wrong, it should be done with
    127    Ijk_NoRedir, since otherwise the side exit, which is intended to
    128    restart the instruction for whatever reason, could go somewhere
    129    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    130    no-redir jumps performance critical, at least for rep-prefixed
    131    instructions, since all iterations thereof would involve such a
    132    jump.  It's not such a big deal with casLE since the side exit is
    133    only taken if the CAS fails, that is, the location is contended,
    134    which is relatively unlikely.
    135 
    136    Note also, the test for CAS success vs failure is done using
    137    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    138    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    139    shouldn't definedness-check these comparisons.  See
    140    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    141    background/rationale.
    142 */
    143 
    144 /* LOCK prefixed instructions.  These are translated using IR-level
    145    CAS statements (IRCAS) and are believed to preserve atomicity, even
    146    from the point of view of some other process racing against a
    147    simulated one (presumably they communicate via a shared memory
    148    segment).
    149 
    150    Handlers which are aware of LOCK prefixes are:
    151       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    152       dis_cmpxchg_G_E  (cmpxchg)
    153       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    154       dis_Grp3         (not, neg)
    155       dis_Grp4         (inc, dec)
    156       dis_Grp5         (inc, dec)
    157       dis_Grp8_Imm     (bts, btc, btr)
    158       dis_bt_G_E       (bts, btc, btr)
    159       dis_xadd_G_E     (xadd)
    160 */
    161 
    162 
    163 #include "libvex_basictypes.h"
    164 #include "libvex_ir.h"
    165 #include "libvex.h"
    166 #include "libvex_guest_amd64.h"
    167 
    168 #include "main_util.h"
    169 #include "main_globals.h"
    170 #include "guest_generic_bb_to_IR.h"
    171 #include "guest_generic_x87.h"
    172 #include "guest_amd64_defs.h"
    173 
    174 
    175 /*------------------------------------------------------------*/
    176 /*--- Globals                                              ---*/
    177 /*------------------------------------------------------------*/
    178 
    179 /* These are set at the start of the translation of an insn, right
    180    down in disInstr_AMD64, so that we don't have to pass them around
    181    endlessly.  They are all constant during the translation of any
    182    given insn. */
    183 
    184 /* These are set at the start of the translation of a BB, so
    185    that we don't have to pass them around endlessly. */
    186 
/* We need to know this to do sub-register accesses correctly. */
static VexEndness host_endness;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static const UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, is set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

static Addr64 guest_RIP_next_assumed;    /* the guessed next-insn %rip */
static Bool   guest_RIP_next_mustcheck;  /* if True, verify the guess
                                            once this insn's decode ends */
    218 
    219 
    220 /*------------------------------------------------------------*/
    221 /*--- Helpers for constructing IR.                         ---*/
    222 /*------------------------------------------------------------*/
    223 
    224 /* Generate a new temporary of the given type. */
    225 static IRTemp newTemp ( IRType ty )
    226 {
    227    vassert(isPlausibleIRType(ty));
    228    return newIRTemp( irsb->tyenv, ty );
    229 }
    230 
    231 /* Add a statement to the list held by "irsb". */
    232 static void stmt ( IRStmt* st )
    233 {
    234    addStmtToIRSB( irsb, st );
    235 }
    236 
    237 /* Generate a statement "dst := e". */
    238 static void assign ( IRTemp dst, IRExpr* e )
    239 {
    240    stmt( IRStmt_WrTmp(dst, e) );
    241 }
    242 
    243 static IRExpr* unop ( IROp op, IRExpr* a )
    244 {
    245    return IRExpr_Unop(op, a);
    246 }
    247 
    248 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    249 {
    250    return IRExpr_Binop(op, a1, a2);
    251 }
    252 
    253 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    254 {
    255    return IRExpr_Triop(op, a1, a2, a3);
    256 }
    257 
    258 static IRExpr* mkexpr ( IRTemp tmp )
    259 {
    260    return IRExpr_RdTmp(tmp);
    261 }
    262 
    263 static IRExpr* mkU8 ( ULong i )
    264 {
    265    vassert(i < 256);
    266    return IRExpr_Const(IRConst_U8( (UChar)i ));
    267 }
    268 
    269 static IRExpr* mkU16 ( ULong i )
    270 {
    271    vassert(i < 0x10000ULL);
    272    return IRExpr_Const(IRConst_U16( (UShort)i ));
    273 }
    274 
    275 static IRExpr* mkU32 ( ULong i )
    276 {
    277    vassert(i < 0x100000000ULL);
    278    return IRExpr_Const(IRConst_U32( (UInt)i ));
    279 }
    280 
    281 static IRExpr* mkU64 ( ULong i )
    282 {
    283    return IRExpr_Const(IRConst_U64(i));
    284 }
    285 
    286 static IRExpr* mkU ( IRType ty, ULong i )
    287 {
    288    switch (ty) {
    289       case Ity_I8:  return mkU8(i);
    290       case Ity_I16: return mkU16(i);
    291       case Ity_I32: return mkU32(i);
    292       case Ity_I64: return mkU64(i);
    293       default: vpanic("mkU(amd64)");
    294    }
    295 }
    296 
    297 static void storeLE ( IRExpr* addr, IRExpr* data )
    298 {
    299    stmt( IRStmt_Store(Iend_LE, addr, data) );
    300 }
    301 
    302 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    303 {
    304    return IRExpr_Load(Iend_LE, ty, addr);
    305 }
    306 
    307 static IROp mkSizedOp ( IRType ty, IROp op8 )
    308 {
    309    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    310            || op8 == Iop_Mul8
    311            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    312            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    313            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    314            || op8 == Iop_CasCmpNE8
    315            || op8 == Iop_Not8 );
    316    switch (ty) {
    317       case Ity_I8:  return 0 +op8;
    318       case Ity_I16: return 1 +op8;
    319       case Ity_I32: return 2 +op8;
    320       case Ity_I64: return 3 +op8;
    321       default: vpanic("mkSizedOp(amd64)");
    322    }
    323 }
    324 
    325 static
    326 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    327 {
    328    if (szSmall == 1 && szBig == 4) {
    329       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    330    }
    331    if (szSmall == 1 && szBig == 2) {
    332       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    333    }
    334    if (szSmall == 2 && szBig == 4) {
    335       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    336    }
    337    if (szSmall == 1 && szBig == 8 && !signd) {
    338       return unop(Iop_8Uto64, src);
    339    }
    340    if (szSmall == 1 && szBig == 8 && signd) {
    341       return unop(Iop_8Sto64, src);
    342    }
    343    if (szSmall == 2 && szBig == 8 && !signd) {
    344       return unop(Iop_16Uto64, src);
    345    }
    346    if (szSmall == 2 && szBig == 8 && signd) {
    347       return unop(Iop_16Sto64, src);
    348    }
    349    vpanic("doScalarWidening(amd64)");
    350 }
    351 
    352 
    353 
    354 /*------------------------------------------------------------*/
    355 /*--- Debugging output                                     ---*/
    356 /*------------------------------------------------------------*/
    357 
/* Bomb out if we can't handle something.  'str' identifies the
   missing feature and is passed straight to vpanic, so this never
   returns. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    365 
    366 #define DIP(format, args...)           \
    367    if (vex_traceflags & VEX_TRACE_FE)  \
    368       vex_printf(format, ## args)
    369 
    370 #define DIS(buf, format, args...)      \
    371    if (vex_traceflags & VEX_TRACE_FE)  \
    372       vex_sprintf(buf, format, ## args)
    373 
    374 
    375 /*------------------------------------------------------------*/
    376 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    377 /*------------------------------------------------------------*/
    378 
/* Integer registers. */
#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

/* Program counter. */
#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

/* Segment-base constants. */
#define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
#define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)

/* Lazily-evaluated condition-code thunk. */
#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

/* x87 FPU state and assorted flags. */
#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

/* SSE/AVX state. */
#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

/* Miscellaneous. */
#define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
#define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    439 
    440 
    441 /*------------------------------------------------------------*/
    442 /*--- Helper bits and pieces for deconstructing the        ---*/
    443 /*--- amd64 insn stream.                                   ---*/
    444 /*------------------------------------------------------------*/
    445 
/* This is the AMD64 register encoding -- integer regs.  Values 8..15
   are only reachable when the extra high bit is supplied by a REX (or
   VEX) prefix bit. */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5
    471 
    472 
    473 /* Various simple conversions */
    474 
/* Sign-extend the low 8 bits of x to 64 bits, via shift-up then
   arithmetic shift-down. */
static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((Long)(((ULong)x) << 56) >> 56);
}

/* Sign-extend the low 16 bits of x to 64 bits. */
static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((Long)(((ULong)x) << 48) >> 48);
}

/* Sign-extend the low 32 bits of x to 64 bits. */
static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((Long)(((ULong)x) << 32) >> 32);
}
    489 
    490 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    491    register or memory.  If so, the byte will have the form 11XXXYYY,
    492    where YYY is the register number. */
    493 inline
    494 static Bool epartIsReg ( UChar mod_reg_rm )
    495 {
    496    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    497 }
    498 
    499 /* Extract the 'g' field from a modRM byte.  This only produces 3
    500    bits, which is not a complete register number.  You should avoid
    501    this function if at all possible. */
    502 inline
    503 static Int gregLO3ofRM ( UChar mod_reg_rm )
    504 {
    505    return (Int)( (mod_reg_rm >> 3) & 7 );
    506 }
    507 
    508 /* Ditto the 'e' field of a modRM byte. */
    509 inline
    510 static Int eregLO3ofRM ( UChar mod_reg_rm )
    511 {
    512    return (Int)(mod_reg_rm & 0x7);
    513 }
    514 
    515 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    516 
    517 static inline UChar getUChar ( Long delta )
    518 {
    519    UChar v = guest_code[delta+0];
    520    return v;
    521 }
    522 
    523 static UInt getUDisp16 ( Long delta )
    524 {
    525    UInt v = guest_code[delta+1]; v <<= 8;
    526    v |= guest_code[delta+0];
    527    return v & 0xFFFF;
    528 }
    529 
    530 //.. static UInt getUDisp ( Int size, Long delta )
    531 //.. {
    532 //..    switch (size) {
    533 //..       case 4: return getUDisp32(delta);
    534 //..       case 2: return getUDisp16(delta);
    535 //..       case 1: return getUChar(delta);
    536 //..       default: vpanic("getUDisp(x86)");
    537 //..    }
    538 //..    return 0; /*notreached*/
    539 //.. }
    540 
    541 
    542 /* Get a byte value out of the insn stream and sign-extend to 64
    543    bits. */
    544 static Long getSDisp8 ( Long delta )
    545 {
    546    return extend_s_8to64( guest_code[delta] );
    547 }
    548 
    549 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    550    bits. */
    551 static Long getSDisp16 ( Long delta )
    552 {
    553    UInt v = guest_code[delta+1]; v <<= 8;
    554    v |= guest_code[delta+0];
    555    return extend_s_16to64( (UShort)v );
    556 }
    557 
    558 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    559    bits. */
    560 static Long getSDisp32 ( Long delta )
    561 {
    562    UInt v = guest_code[delta+3]; v <<= 8;
    563    v |= guest_code[delta+2]; v <<= 8;
    564    v |= guest_code[delta+1]; v <<= 8;
    565    v |= guest_code[delta+0];
    566    return extend_s_32to64( v );
    567 }
    568 
    569 /* Get a 64-bit value out of the insn stream. */
    570 static Long getDisp64 ( Long delta )
    571 {
    572    ULong v = 0;
    573    v |= guest_code[delta+7]; v <<= 8;
    574    v |= guest_code[delta+6]; v <<= 8;
    575    v |= guest_code[delta+5]; v <<= 8;
    576    v |= guest_code[delta+4]; v <<= 8;
    577    v |= guest_code[delta+3]; v <<= 8;
    578    v |= guest_code[delta+2]; v <<= 8;
    579    v |= guest_code[delta+1]; v <<= 8;
    580    v |= guest_code[delta+0];
    581    return v;
    582 }
    583 
    584 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    585    if this is called with size==8.  Should not happen. */
    586 static Long getSDisp ( Int size, Long delta )
    587 {
    588    switch (size) {
    589       case 4: return getSDisp32(delta);
    590       case 2: return getSDisp16(delta);
    591       case 1: return getSDisp8(delta);
    592       default: vpanic("getSDisp(amd64)");
    593   }
    594 }
    595 
    596 static ULong mkSizeMask ( Int sz )
    597 {
    598    switch (sz) {
    599       case 1: return 0x00000000000000FFULL;
    600       case 2: return 0x000000000000FFFFULL;
    601       case 4: return 0x00000000FFFFFFFFULL;
    602       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    603       default: vpanic("mkSzMask(amd64)");
    604    }
    605 }
    606 
    607 static Int imin ( Int a, Int b )
    608 {
    609    return (a < b) ? a : b;
    610 }
    611 
    612 static IRType szToITy ( Int n )
    613 {
    614    switch (n) {
    615       case 1: return Ity_I8;
    616       case 2: return Ity_I16;
    617       case 4: return Ity_I32;
    618       case 8: return Ity_I64;
    619       default: vex_printf("\nszToITy(%d)\n", n);
    620                vpanic("szToITy(amd64)");
    621    }
    622 }
    623 
    624 
    625 /*------------------------------------------------------------*/
    626 /*--- For dealing with prefixes.                           ---*/
    627 /*------------------------------------------------------------*/
    628 
    629 /* The idea is to pass around an int holding a bitmask summarising
    630    info from the prefixes seen on the current instruction, including
    631    info from the REX byte.  This info is used in various places, but
    632    most especially when making sense of register fields in
    633    instructions.
    634 
    635    The top 8 bits of the prefix are 0x55, just as a hacky way to
    636    ensure it really is a valid prefix.
    637 
    638    Things you can safely assume about a well-formed prefix:
    639    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    640    * if REX is not present then REXW,REXR,REXX,REXB will read
    641      as zero.
    642    * F2 and F3 will not both be 1.
    643 */
    644 
typedef UInt  Prefix;

#define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
#define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
#define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
#define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
#define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
#define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
#define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
#define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
#define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
#define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
#define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
#define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
#define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
#define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
/* The extra register field VEX.vvvv is encoded (after not-ing it) as
   PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
   positions. */
#define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
#define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
#define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
#define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */


/* 0x55 in the top byte marks a well-formed Prefix value. */
#define PFX_EMPTY 0x55000000
    675 
    676 static Bool IS_VALID_PFX ( Prefix pfx ) {
    677    return toBool((pfx & 0xFF000000) == PFX_EMPTY);
    678 }
    679 
    680 static Bool haveREX ( Prefix pfx ) {
    681    return toBool(pfx & PFX_REX);
    682 }
    683 
    684 static Int getRexW ( Prefix pfx ) {
    685    return (pfx & PFX_REXW) ? 1 : 0;
    686 }
    687 static Int getRexR ( Prefix pfx ) {
    688    return (pfx & PFX_REXR) ? 1 : 0;
    689 }
    690 static Int getRexX ( Prefix pfx ) {
    691    return (pfx & PFX_REXX) ? 1 : 0;
    692 }
    693 static Int getRexB ( Prefix pfx ) {
    694    return (pfx & PFX_REXB) ? 1 : 0;
    695 }
    696 
    697 /* Check a prefix doesn't have F2 or F3 set in it, since usually that
    698    completely changes what instruction it really is. */
    699 static Bool haveF2orF3 ( Prefix pfx ) {
    700    return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
    701 }
    702 static Bool haveF2andF3 ( Prefix pfx ) {
    703    return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
    704 }
    705 static Bool haveF2 ( Prefix pfx ) {
    706    return toBool((pfx & PFX_F2) > 0);
    707 }
    708 static Bool haveF3 ( Prefix pfx ) {
    709    return toBool((pfx & PFX_F3) > 0);
    710 }
    711 
    712 static Bool have66 ( Prefix pfx ) {
    713    return toBool((pfx & PFX_66) > 0);
    714 }
    715 static Bool haveASO ( Prefix pfx ) {
    716    return toBool((pfx & PFX_ASO) > 0);
    717 }
    718 static Bool haveLOCK ( Prefix pfx ) {
    719    return toBool((pfx & PFX_LOCK) > 0);
    720 }
    721 
    722 /* Return True iff pfx has 66 set and F2 and F3 clear */
    723 static Bool have66noF2noF3 ( Prefix pfx )
    724 {
    725   return
    726      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    727 }
    728 
    729 /* Return True iff pfx has F2 set and 66 and F3 clear */
    730 static Bool haveF2no66noF3 ( Prefix pfx )
    731 {
    732   return
    733      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    734 }
    735 
    736 /* Return True iff pfx has F3 set and 66 and F2 clear */
    737 static Bool haveF3no66noF2 ( Prefix pfx )
    738 {
    739   return
    740      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    741 }
    742 
    743 /* Return True iff pfx has F3 set and F2 clear */
    744 static Bool haveF3noF2 ( Prefix pfx )
    745 {
    746   return
    747      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    748 }
    749 
    750 /* Return True iff pfx has F2 set and F3 clear */
    751 static Bool haveF2noF3 ( Prefix pfx )
    752 {
    753   return
    754      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    755 }
    756 
    757 /* Return True iff pfx has 66, F2 and F3 clear */
    758 static Bool haveNo66noF2noF3 ( Prefix pfx )
    759 {
    760   return
    761      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    762 }
    763 
    764 /* Return True iff pfx has any of 66, F2 and F3 set */
    765 static Bool have66orF2orF3 ( Prefix pfx )
    766 {
    767   return toBool( ! haveNo66noF2noF3(pfx) );
    768 }
    769 
    770 /* Return True iff pfx has 66 or F3 set */
    771 static Bool have66orF3 ( Prefix pfx )
    772 {
    773    return toBool((pfx & (PFX_66|PFX_F3)) > 0);
    774 }
    775 
    776 /* Clear all the segment-override bits in a prefix. */
    777 static Prefix clearSegBits ( Prefix p )
    778 {
    779    return
    780       p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
    781 }
    782 
    783 /* Get the (inverted, hence back to "normal") VEX.vvvv field. */
    784 static UInt getVexNvvvv ( Prefix pfx ) {
    785    UInt r = (UInt)pfx;
    786    r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
    787    return r & 0xF;
    788 }
    789 
    790 static Bool haveVEX ( Prefix pfx ) {
    791    return toBool(pfx & PFX_VEX);
    792 }
    793 
    794 static Int getVexL ( Prefix pfx ) {
    795    return (pfx & PFX_VEXL) ? 1 : 0;
    796 }
    797 
    798 
    799 /*------------------------------------------------------------*/
    800 /*--- For dealing with escapes                             ---*/
    801 /*------------------------------------------------------------*/
    802 
    803 
/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking. */

typedef
   enum {
      ESC_NONE=0xF0000000, // no escape seen
      ESC_0F,              // 0F        (two-byte opcode map)
      ESC_0F38,            // 0F 38     (three-byte opcode map)
      ESC_0F3A             // 0F 3A     (three-byte opcode map)
   }
   Escape;
    818 
    819 
    820 /*------------------------------------------------------------*/
    821 /*--- For dealing with integer registers                   ---*/
    822 /*------------------------------------------------------------*/
    823 
    824 /* This is somewhat complex.  The rules are:
    825 
    826    For 64, 32 and 16 bit register references, the e or g fields in the
    827    modrm bytes supply the low 3 bits of the register number.  The
    828    fourth (most-significant) bit of the register number is supplied by
    829    the REX byte, if it is present; else that bit is taken to be zero.
    830 
    831    The REX.R bit supplies the high bit corresponding to the g register
    832    field, and the REX.B bit supplies the high bit corresponding to the
    833    e register field (when the mod part of modrm indicates that modrm's
    834    e component refers to a register and not to memory).
    835 
    836    The REX.X bit supplies a high register bit for certain registers
    837    in SIB address modes, and is generally rarely used.
    838 
    839    For 8 bit register references, the presence of the REX byte itself
    840    has significance.  If there is no REX present, then the 3-bit
    841    number extracted from the modrm e or g field is treated as an index
    842    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    843    old x86 encoding scheme.
    844 
    845    But if there is a REX present, the register reference is
    846    interpreted in the same way as for 64/32/16-bit references: a high
    847    bit is extracted from REX, giving a 4-bit number, and the denoted
    848    register is the lowest 8 bits of the 16 integer registers denoted
    849    by the number.  In particular, values 3 through 7 of this sequence
    850    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    851    %rsp %rbp %rsi %rdi.
    852 
    853    The REX.W bit has no bearing at all on register numbers.  Instead
    854    its presence indicates that the operand size is to be overridden
    855    from its default value (32 bits) to 64 bits instead.  This is in
    856    the same fashion that an 0x66 prefix indicates the operand size is
    857    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    858    0x66 are present there is a conflict, and REX.W takes precedence.
    859 
    860    Rather than try to handle this complexity using a single huge
    861    function, several smaller ones are provided.  The aim is to make it
    862    as difficult as possible to screw up register decoding in a subtle
    863    and hard-to-track-down way.
    864 
    865    Because these routines fish around in the host's memory (that is,
    866    in the guest state area) for sub-parts of guest registers, their
    867    correctness depends on the host's endianness.  So far these
    868    routines only work for little-endian hosts.  Those for which
    869    endianness is important have assertions to ensure sanity.
    870 */
    871 
    872 
/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ?
   Maps an architectural register number (0 .. 15, the R_RAX .. R_R15
   constants) to its byte offset in the guest state block.  Any other
   value indicates a decoder bug, hence the panic. */

static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    898 
    899 
/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present: then the old x86
   high-byte names (%ah %ch %dh %bh) apply for reg values 4 .. 7. */

static
const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static const HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static const HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   /* Only used when sz==1 and no REX: the pre-REX 8-bit names. */
   static const HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   /* 'irregular' is only meaningful for 1-byte accesses, and then
      restricts reg to the 8 pre-REX encodings. */
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
    943 
/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register.  In the irregular
   (sz==1, no REX) case, reg values 4 .. 7 denote the high bytes
   %ah %ch %dh %bh, which on a little-endian host live one byte
   above the base of %rax %rcx %rdx %rbx respectively. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;   /* %ah */
         case R_RBP: return 1+ OFFB_RCX;   /* %ch */
         case R_RSI: return 1+ OFFB_RDX;   /* %dh */
         case R_RDI: return 1+ OFFB_RBX;   /* %bh */
         default:    break; /* use the normal case */
      }
   }

   /* Normal case: low bytes of each register share the register's
      base offset (little-endian host). */
   return integerGuestReg64Offset(reg);
}
    972 
    973 
/* Read the %CL register :: Ity_I8, for shift/rotate operations.
   Reads the low byte of %rcx directly from the guest state, which is
   only correct on a little-endian host, hence the assertion. */

static IRExpr* getIRegCL ( void )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register (the second-lowest byte of %rax; offset
   +1 is little-endian-specific). */

static void putIRegAH ( IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}


/* Read/write various widths of %RAX, as it has various
   special-purpose uses. */

/* Name of the sz-byte-wide view of %rax, for printing. */
static const HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}
   1006 
/* Read the sz-byte-wide view of %rax.  The 4-byte case reads the
   whole 64-bit register and narrows, rather than doing a 32-bit Get,
   so the IR type of the Get is uniform with 64-bit accesses. */
static IRExpr* getIRegRAX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}

/* Write the sz-byte-wide view of %rax.  A 4-byte write is
   zero-extended to 64 bits before the Put, implementing the AMD64
   rule that writing a 32-bit register zeroes the upper half; 1- and
   2-byte writes leave the upper bytes untouched. */
static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}


/* Read/write various widths of %RDX, as it has various
   special-purpose uses.  These mirror the %RAX accessors above. */

/* Name of the sz-byte-wide view of %rdx, for printing. */
static const HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

/* Read the sz-byte-wide view of %rdx; see getIRegRAX for the
   rationale of the 4-byte case. */
static IRExpr* getIRegRDX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

/* Write the sz-byte-wide view of %rdx; 4-byte writes zero-extend,
   as in putIRegRAX. */
static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}
   1083 
   1084 
/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

/* Read 64-bit integer register regno (0 .. 15). */
static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

/* Write 64-bit integer register regno (0 .. 15). */
static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

/* Printable name of 64-bit integer register regno. */
static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}


/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

/* Read the low 32 bits of register regno, via a 64-bit Get plus
   narrowing. */
static IRExpr* getIReg32 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

/* Write the low 32 bits of register regno, zero-extending to 64
   bits (AMD64 32-bit-write semantics). */
static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

/* Printable name of the 32-bit view of register regno. */
static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}


/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

/* Read the low 16 bits of register regno (little-endian host). */
static IRExpr* getIReg16 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

/* Write the low 16 bits of register regno.  NOTE the 16Uto64
   widening before the Put: the full 64-bit slot is written, with
   the upper 48 bits zeroed. */
static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

/* Printable name of the 16-bit view of register regno. */
static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}
   1151 
   1152 
/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/

/* Read the 64-bit register denoted by lo3bits extended with REX.X. */
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

/* Name of the 64-bit register denoted by lo3bits extended with
   REX.X. */
static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

/* Name of the sz-byte register denoted by lo3bits extended with
   REX.B.  The irregular (high-byte) naming applies when sz==1 and
   no REX byte is present. */
static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1179 
   1180 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1181 {
   1182    vassert(lo3bits < 8);
   1183    vassert(IS_VALID_PFX(pfx));
   1184    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1185    if (sz == 4) {
   1186       sz = 8;
   1187       return unop(Iop_64to32,
   1188                   IRExpr_Get(
   1189                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1190                                      False/*!irregular*/ ),
   1191                      szToITy(sz)
   1192                  )
   1193              );
   1194    } else {
   1195       return IRExpr_Get(
   1196                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1197                                 toBool(sz==1 && !haveREX(pfx)) ),
   1198                 szToITy(sz)
   1199              );
   1200    }
   1201 }
   1202 
/* Write the sz-byte register denoted by lo3bits extended with REX.B.
   4-byte writes are zero-extended to 64 bits (AMD64 32-bit-write
   semantics); the irregular (high-byte) mapping applies when sz==1
   and no REX byte is present. */
static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}
   1215 
   1216 
   1217 /* Functions for getting register numbers from modrm bytes and REX
   1218    when we don't have to consider the complexities of integer subreg
   1219    accesses.
   1220 */
   1221 /* Extract the g reg field from a modRM byte, and augment it using the
   1222    REX.R bit from the supplied REX byte.  The R bit usually is
   1223    associated with the g register field.
   1224 */
   1225 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1226 {
   1227    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1228    reg += (pfx & PFX_REXR) ? 8 : 0;
   1229    return reg;
   1230 }
   1231 
   1232 /* Extract the e reg field from a modRM byte, and augment it using the
   1233    REX.B bit from the supplied REX byte.  The B bit usually is
   1234    associated with the e register field (when modrm indicates e is a
   1235    register, that is).
   1236 */
   1237 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1238 {
   1239    Int rm;
   1240    vassert(epartIsReg(mod_reg_rm));
   1241    rm = (Int)(mod_reg_rm & 0x7);
   1242    rm += (pfx & PFX_REXB) ? 8 : 0;
   1243    return rm;
   1244 }
   1245 
   1246 
/* General functions for dealing with integer register access. */

/* Produce the guest state offset for a reference to the 'g' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.
*/
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

/* Read the 'g' register of a modrm byte at width sz.  4-byte reads
   are done as a 64-bit Get narrowed to 32 bits. */
static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

/* Write the 'g' register of a modrm byte at width sz.  4-byte
   writes are zero-extended to 64 bits before the Put. */
static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

/* Printable name of the 'g' register of a modrm byte at width sz. */
static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/* Read the register selected by the VEX.vvvv field at width sz.
   Note the irregular (high-byte) mapping is never used here. */
static
IRExpr* getIRegV ( Int sz, Prefix pfx )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                         szToITy(sz) );
   }
}

/* Write the register selected by the VEX.vvvv field at width sz;
   4-byte writes zero-extend to 64 bits. */
static
void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
}

/* Printable name of the register selected by VEX.vvvv at width sz. */
static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}
   1324 
   1325 
   1326 
/* Produce the guest state offset for a reference to the 'e' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   denotes a memory access rather than a register access.
*/
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

/* Read the 'e' register of a modrm byte at width sz.  4-byte reads
   are done as a 64-bit Get narrowed to 32 bits. */
static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

/* Write the 'e' register of a modrm byte at width sz.  4-byte
   writes are zero-extended to 64 bits before the Put. */
static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

/* Printable name of the 'e' register of a modrm byte at width sz. */
static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1372 
   1373 
   1374 /*------------------------------------------------------------*/
   1375 /*--- For dealing with XMM registers                       ---*/
   1376 /*------------------------------------------------------------*/
   1377 
/* Map a YMM register number (0 .. 15) to its byte offset in the
   guest state.  Any other value is a decoder bug, hence the panic. */
static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}

/* An XMM register is the low 128 bits of the corresponding YMM
   register, so (on a little-endian host) it shares the same base
   offset. */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   return ymmGuestRegOffset( xmmreg );
}
   1407 
/* Lanes of vector registers are always numbered from zero being the
   least significant lane (rightmost in the register).  Each helper
   asserts the lane index is in range for the lane width / register
   width combination, then returns base offset + lane * lane-size. */

/* 16-bit lane of an XMM register: 8 lanes. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

/* 32-bit lane of an XMM register: 4 lanes. */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

/* 64-bit lane of an XMM register: 2 lanes. */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

/* 128-bit lane of a YMM register: 2 lanes. */
static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}

/* 64-bit lane of a YMM register: 4 lanes. */
static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}

/* 32-bit lane of a YMM register: 8 lanes. */
static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
   1458 
/* Read a whole XMM register as a V128. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

/* Read a 64-bit lane of an XMM register as an integer (I64). */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

/* Read a 64-bit lane of an XMM register as a double (F64). */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

/* Read a 32-bit lane of an XMM register as an integer (I32). */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

/* Read a 32-bit lane of an XMM register as a float (F32). */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

/* Read a 16-bit lane of an XMM register (I16). */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}

/* Write a whole XMM register (V128).  Note this leaves the upper
   half of the containing YMM register untouched. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

/* Write a 64-bit integer lane of an XMM register. */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write a 64-bit double lane of an XMM register. */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write a 32-bit float lane of an XMM register. */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

/* Write a 32-bit integer lane of an XMM register. */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1518 
/* Read a whole YMM register as a V256. */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}

/* Read a 128-bit lane of a YMM register (V128). */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

/* Read a 64-bit lane of a YMM register (I64). */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

/* Read a 32-bit lane of a YMM register (I32). */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}

/* Write a whole YMM register (V256). */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

/* Write a 128-bit lane of a YMM register. */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

/* Write a 64-bit double lane of a YMM register. */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* Write a 64-bit integer lane of a YMM register. */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* Write a 32-bit float lane of a YMM register. */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

/* Write a 32-bit integer lane of a YMM register. */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

/* Make a 128-bit constant from a 16-bit mask (one mask bit per
   byte of the result, as defined by IRConst_V128). */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}

/* Logical AND of two Ity_I1 values, computed by widening to I64,
   ANDing, and narrowing back to I1. */
static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}
   1596 
/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must agree in type, and must be one of
      the integer widths CAS supports. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single (non-double) little-endian CAS: oldTmp receives the
      value that was in memory at addr. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the old value differs from the expected one, the CAS did not
      store; exit the block back to restart_point so the guest
      instruction is retried. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
   1626 
   1627 
   1628 /*------------------------------------------------------------*/
   1629 /*--- Helpers for %rflags.                                 ---*/
   1630 /*------------------------------------------------------------*/
   1631 
   1632 /* -------------- Evaluating the flags-thunk. -------------- */
   1633 
   1634 /* Build IR to calculate all the eflags from stored
   1635    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1636    Ity_I64. */
   1637 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1638 {
   1639    IRExpr** args
   1640       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1641                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1642                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1643                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1644    IRExpr* call
   1645       = mkIRExprCCall(
   1646            Ity_I64,
   1647            0/*regparm*/,
   1648            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1649            args
   1650         );
   1651    /* Exclude OP and NDEP from definedness checking.  We're only
   1652       interested in DEP1 and DEP2. */
   1653    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1654    return call;
   1655 }
   1656 
   1657 /* Build IR to calculate some particular condition from stored
   1658    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1659    Ity_Bit. */
   1660 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1661 {
   1662    IRExpr** args
   1663       = mkIRExprVec_5( mkU64(cond),
   1664                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1665                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1666                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1667                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1668    IRExpr* call
   1669       = mkIRExprCCall(
   1670            Ity_I64,
   1671            0/*regparm*/,
   1672            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1673            args
   1674         );
   1675    /* Exclude the requested condition, OP and NDEP from definedness
   1676       checking.  We're only interested in DEP1 and DEP2. */
   1677    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1678    return unop(Iop_64to1, call);
   1679 }
   1680 
   1681 /* Build IR to calculate just the carry flag from stored
   1682    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1683 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1684 {
   1685    IRExpr** args
   1686       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1687                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1688                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1689                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1690    IRExpr* call
   1691       = mkIRExprCCall(
   1692            Ity_I64,
   1693            0/*regparm*/,
   1694            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1695            args
   1696         );
   1697    /* Exclude OP and NDEP from definedness checking.  We're only
   1698       interested in DEP1 and DEP2. */
   1699    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1700    return call;
   1701 }
   1702 
   1703 
   1704 /* -------------- Building the flags-thunk. -------------- */
   1705 
   1706 /* The machinery in this section builds the flag-thunk following a
   1707    flag-setting operation.  Hence the various setFlags_* functions.
   1708 */
   1709 
   1710 static Bool isAddSub ( IROp op8 )
   1711 {
   1712    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1713 }
   1714 
   1715 static Bool isLogic ( IROp op8 )
   1716 {
   1717    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1718 }
   1719 
   1720 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1721 static IRExpr* widenUto64 ( IRExpr* e )
   1722 {
   1723    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1724       case Ity_I64: return e;
   1725       case Ity_I32: return unop(Iop_32Uto64, e);
   1726       case Ity_I16: return unop(Iop_16Uto64, e);
   1727       case Ity_I8:  return unop(Iop_8Uto64, e);
   1728       case Ity_I1:  return unop(Iop_1Uto64, e);
   1729       default: vpanic("widenUto64");
   1730    }
   1731 }
   1732 
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}
   1744 
   1745 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1746    of these combinations make sense. */
   1747 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1748 {
   1749    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1750    if (src_ty == dst_ty)
   1751       return e;
   1752    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1753       return unop(Iop_32to16, e);
   1754    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1755       return unop(Iop_32to8, e);
   1756    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1757       return unop(Iop_64to32, e);
   1758    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1759       return unop(Iop_64to16, e);
   1760    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1761       return unop(Iop_64to8, e);
   1762 
   1763    vex_printf("\nsrc, dst tys are: ");
   1764    ppIRType(src_ty);
   1765    vex_printf(", ");
   1766    ppIRType(dst_ty);
   1767    vex_printf("\n");
   1768    vpanic("narrowTo(amd64)");
   1769 }
   1770 
   1771 
   1772 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1773    auto-sized up to the real op. */
   1774 
   1775 static
   1776 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1777 {
   1778    Int ccOp = 0;
   1779    switch (ty) {
   1780       case Ity_I8:  ccOp = 0; break;
   1781       case Ity_I16: ccOp = 1; break;
   1782       case Ity_I32: ccOp = 2; break;
   1783       case Ity_I64: ccOp = 3; break;
   1784       default: vassert(0);
   1785    }
   1786    switch (op8) {
   1787       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1788       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1789       default:       ppIROp(op8);
   1790                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1791    }
   1792    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1793    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1794    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1795 }
   1796 
   1797 
   1798 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1799 
   1800 static
   1801 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1802 {
   1803    Int ccOp = 0;
   1804    switch (ty) {
   1805       case Ity_I8:  ccOp = 0; break;
   1806       case Ity_I16: ccOp = 1; break;
   1807       case Ity_I32: ccOp = 2; break;
   1808       case Ity_I64: ccOp = 3; break;
   1809       default: vassert(0);
   1810    }
   1811    switch (op8) {
   1812       case Iop_Or8:
   1813       case Iop_And8:
   1814       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1815       default:       ppIROp(op8);
   1816                      vpanic("setFlags_DEP1(amd64)");
   1817    }
   1818    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1819    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1820    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1821 }
   1822 
   1823 
/* For shift operations, we put in the result and the undershifted
   result.  Except if the shift amount is zero, the thunk is left
   unchanged. */

static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Thunk ops come in B/W/L/Q groups of four; pick the size offset. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each thunk field is updated only when the guard (shift amount
      != 0) holds; otherwise the previous field value is re-written,
      leaving the thunk -- and hence the flags -- unchanged. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU64(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
}
   1873 
   1874 
/* For the inc/dec case, we store in DEP1 the result value and in NDEP
   the former value of the carry flag, which unfortunately we have to
   compute. */

static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Base op selects INC vs DEC; the size offset is added below. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
   1898 
   1899 
   1900 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1901    two arguments. */
   1902 
   1903 static
   1904 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1905 {
   1906    switch (ty) {
   1907       case Ity_I8:
   1908          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1909          break;
   1910       case Ity_I16:
   1911          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1912          break;
   1913       case Ity_I32:
   1914          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1915          break;
   1916       case Ity_I64:
   1917          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1918          break;
   1919       default:
   1920          vpanic("setFlags_MUL(amd64)");
   1921    }
   1922    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1923    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1924 }
   1925 
   1926 
   1927 /* -------------- Condition codes. -------------- */
   1928 
   1929 /* Condition codes, using the AMD encoding.  */
   1930 
   1931 static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
   1932 {
   1933    switch (cond) {
   1934       case AMD64CondO:      return "o";
   1935       case AMD64CondNO:     return "no";
   1936       case AMD64CondB:      return "b";
   1937       case AMD64CondNB:     return "ae"; /*"nb";*/
   1938       case AMD64CondZ:      return "e"; /*"z";*/
   1939       case AMD64CondNZ:     return "ne"; /*"nz";*/
   1940       case AMD64CondBE:     return "be";
   1941       case AMD64CondNBE:    return "a"; /*"nbe";*/
   1942       case AMD64CondS:      return "s";
   1943       case AMD64CondNS:     return "ns";
   1944       case AMD64CondP:      return "p";
   1945       case AMD64CondNP:     return "np";
   1946       case AMD64CondL:      return "l";
   1947       case AMD64CondNL:     return "ge"; /*"nl";*/
   1948       case AMD64CondLE:     return "le";
   1949       case AMD64CondNLE:    return "g"; /*"nle";*/
   1950       case AMD64CondAlways: return "ALWAYS";
   1951       default: vpanic("name_AMD64Condcode");
   1952    }
   1953 }
   1954 
   1955 static
   1956 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1957                                           /*OUT*/Bool*   needInvert )
   1958 {
   1959    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1960    if (cond & 1) {
   1961       *needInvert = True;
   1962       return cond-1;
   1963    } else {
   1964       *needInvert = False;
   1965       return cond;
   1966    }
   1967 }
   1968 
   1969 
   1970 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1971 
   1972 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1973    appropriately.
   1974 
   1975    Optionally, generate a store for the 'tres' value.  This can either
   1976    be a normal store, or it can be a cas-with-possible-failure style
   1977    store:
   1978 
   1979    if taddr is IRTemp_INVALID, then no store is generated.
   1980 
   1981    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1982    the address) is generated:
   1983 
   1984      if texpVal is IRTemp_INVALID then a normal store is
   1985      generated, and restart_point must be zero (it is irrelevant).
   1986 
   1987      if texpVal is not IRTemp_INVALID then a cas-style store is
   1988      generated.  texpVal is the expected value, restart_point
   1989      is the restart point if the store fails, and texpVal must
   1990      have the same type as tres.
   1991 
   1992 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);  /* old carry, as a 64-bit value */
   IRTemp  oldcn = newTemp(ty);       /* old carry, narrowed to 'ty' */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the size-specific ADC thunk operation. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk encoding for ADC: DEP1 = first arg, DEP2 = second arg
      XOR'd with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2046 
   2047 
   2048 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2049    appropriately.  As with helper_ADC, possibly generate a store of
   2050    the result -- see comments on helper_ADC for details.
   2051 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);  /* old carry, as a 64-bit value */
   IRTemp  oldcn = newTemp(ty);       /* old carry, narrowed to 'ty' */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the size-specific SBB thunk operation. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk encoding for SBB: DEP1 = first arg, DEP2 = second arg
      XOR'd with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2105 
   2106 
   2107 /* -------------- Helpers for disassembly printing. -------------- */
   2108 
   2109 static const HChar* nameGrp1 ( Int opc_aux )
   2110 {
   2111    static const HChar* grp1_names[8]
   2112      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2113    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2114    return grp1_names[opc_aux];
   2115 }
   2116 
   2117 static const HChar* nameGrp2 ( Int opc_aux )
   2118 {
   2119    static const HChar* grp2_names[8]
   2120      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2121    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2122    return grp2_names[opc_aux];
   2123 }
   2124 
   2125 static const HChar* nameGrp4 ( Int opc_aux )
   2126 {
   2127    static const HChar* grp4_names[8]
   2128      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2129    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2130    return grp4_names[opc_aux];
   2131 }
   2132 
   2133 static const HChar* nameGrp5 ( Int opc_aux )
   2134 {
   2135    static const HChar* grp5_names[8]
   2136      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2137    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2138    return grp5_names[opc_aux];
   2139 }
   2140 
   2141 static const HChar* nameGrp8 ( Int opc_aux )
   2142 {
   2143    static const HChar* grp8_names[8]
   2144       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2145    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2146    return grp8_names[opc_aux];
   2147 }
   2148 
   2149 //.. static const HChar* nameSReg ( UInt sreg )
   2150 //.. {
   2151 //..    switch (sreg) {
   2152 //..       case R_ES: return "%es";
   2153 //..       case R_CS: return "%cs";
   2154 //..       case R_SS: return "%ss";
   2155 //..       case R_DS: return "%ds";
   2156 //..       case R_FS: return "%fs";
   2157 //..       case R_GS: return "%gs";
   2158 //..       default: vpanic("nameSReg(x86)");
   2159 //..    }
   2160 //.. }
   2161 
   2162 static const HChar* nameMMXReg ( Int mmxreg )
   2163 {
   2164    static const HChar* mmx_names[8]
   2165      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2166    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2167    return mmx_names[mmxreg];
   2168 }
   2169 
   2170 static const HChar* nameXMMReg ( Int xmmreg )
   2171 {
   2172    static const HChar* xmm_names[16]
   2173      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2174          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2175          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2176          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2177    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2178    return xmm_names[xmmreg];
   2179 }
   2180 
   2181 static const HChar* nameMMXGran ( Int gran )
   2182 {
   2183    switch (gran) {
   2184       case 0: return "b";
   2185       case 1: return "w";
   2186       case 2: return "d";
   2187       case 3: return "q";
   2188       default: vpanic("nameMMXGran(amd64,guest)");
   2189    }
   2190 }
   2191 
   2192 static HChar nameISize ( Int size )
   2193 {
   2194    switch (size) {
   2195       case 8: return 'q';
   2196       case 4: return 'l';
   2197       case 2: return 'w';
   2198       case 1: return 'b';
   2199       default: vpanic("nameISize(amd64)");
   2200    }
   2201 }
   2202 
   2203 static const HChar* nameYMMReg ( Int ymmreg )
   2204 {
   2205    static const HChar* ymm_names[16]
   2206      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2207          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2208          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2209          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2210    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2211    return ymm_names[ymmreg];
   2212 }
   2213 
   2214 
   2215 /*------------------------------------------------------------*/
   2216 /*--- JMP helpers                                          ---*/
   2217 /*------------------------------------------------------------*/
   2218 
   2219 static void jmp_lit( /*MOD*/DisResult* dres,
   2220                      IRJumpKind kind, Addr64 d64 )
   2221 {
   2222    vassert(dres->whatNext    == Dis_Continue);
   2223    vassert(dres->len         == 0);
   2224    vassert(dres->continueAt  == 0);
   2225    vassert(dres->jk_StopHere == Ijk_INVALID);
   2226    dres->whatNext    = Dis_StopHere;
   2227    dres->jk_StopHere = kind;
   2228    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2229 }
   2230 
   2231 static void jmp_treg( /*MOD*/DisResult* dres,
   2232                       IRJumpKind kind, IRTemp t )
   2233 {
   2234    vassert(dres->whatNext    == Dis_Continue);
   2235    vassert(dres->len         == 0);
   2236    vassert(dres->continueAt  == 0);
   2237    vassert(dres->jk_StopHere == Ijk_INVALID);
   2238    dres->whatNext    = Dis_StopHere;
   2239    dres->jk_StopHere = kind;
   2240    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2241 }
   2242 
/* End this instruction with a conditional branch: go to d64_true if
   'cond' holds, else d64_false.  The condition is first positivised
   (reduced to an even-numbered code plus a possible inversion), and
   the IR is emitted as a side-exit on the condition plus an
   unconditional fall-through Put to RIP. */
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   Bool          invert;
   AMD64Condcode condPos;
   /* Must be the first and only terminator for this instruction. */
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* Inverted sense: exit to the false target when the positivised
         condition holds, fall through to the true target. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* Normal sense: exit to the true target when the condition
         holds, fall through to the false target. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2270 
   2271 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2272    guest address of the next instruction to be executed.
   2273 
   2274    This function generates an AbiHint to say that -128(%rsp)
   2275    .. -1(%rsp) should now be regarded as uninitialised.
   2276 */
static
void make_redzone_AbiHint ( const VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, const HChar* who )
{
   /* Red-zone size comes from the ABI description. */
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only AbI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);  /* debug-only trace */
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   if (szB > 0)
      /* Mark [new_rsp - szB, new_rsp) as undefined; 'nia' identifies
         the next instruction for the benefit of tools consuming the
         hint. */
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2299 
   2300 
   2301 /*------------------------------------------------------------*/
   2302 /*--- Disassembling addressing modes                       ---*/
   2303 /*------------------------------------------------------------*/
   2304 
   2305 static
   2306 const HChar* segRegTxt ( Prefix pfx )
   2307 {
   2308    if (pfx & PFX_CS) return "%cs:";
   2309    if (pfx & PFX_DS) return "%ds:";
   2310    if (pfx & PFX_ES) return "%es:";
   2311    if (pfx & PFX_FS) return "%fs:";
   2312    if (pfx & PFX_GS) return "%gs:";
   2313    if (pfx & PFX_SS) return "%ss:";
   2314    return ""; /* no override */
   2315 }
   2316 
   2317 
   2318 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2319    linear address by adding any required segment override as indicated
   2320    by sorb, and also dealing with any address size override
   2321    present. */
static
IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* Note that the below are hacks that relies on the assumption
      that %fs or %gs are constant.
      Typically, %fs is always 0x63 on linux (in the main thread, it
      stays at value 0), %gs always 0x60 on Darwin, ... */
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_const) {
         /* return virtual + guest_FS_CONST. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_CONST, Ity_I64));
      } else {
         /* No constant %fs base available: give up on translation. */
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_const) {
         /* return virtual + guest_GS_CONST. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_CONST, Ity_I64));
      } else {
         /* No constant %gs base available: give up on translation. */
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   /* A 0x67 prefix truncates the effective address to 32 bits; model
      this by narrowing to 32 bits and zero-extending back to 64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2359 
   2360 //.. {
   2361 //..    Int    sreg;
   2362 //..    IRType hWordTy;
   2363 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2364 //..
   2365 //..    if (sorb == 0)
   2366 //..       /* the common case - no override */
   2367 //..       return virtual;
   2368 //..
   2369 //..    switch (sorb) {
   2370 //..       case 0x3E: sreg = R_DS; break;
   2371 //..       case 0x26: sreg = R_ES; break;
   2372 //..       case 0x64: sreg = R_FS; break;
   2373 //..       case 0x65: sreg = R_GS; break;
   2374 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2375 //..    }
   2376 //..
   2377 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2378 //..
   2379 //..    seg_selector = newTemp(Ity_I32);
   2380 //..    ldt_ptr      = newTemp(hWordTy);
   2381 //..    gdt_ptr      = newTemp(hWordTy);
   2382 //..    r64          = newTemp(Ity_I64);
   2383 //..
   2384 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2385 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2386 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2387 //..
   2388 //..    /*
   2389 //..    Call this to do the translation and limit checks:
   2390 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2391 //..                                  UInt seg_selector, UInt virtual_addr )
   2392 //..    */
   2393 //..    assign(
   2394 //..       r64,
   2395 //..       mkIRExprCCall(
   2396 //..          Ity_I64,
   2397 //..          0/*regparms*/,
   2398 //..          "x86g_use_seg_selector",
   2399 //..          &x86g_use_seg_selector,
   2400 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2401 //..                         mkexpr(seg_selector), virtual)
   2402 //..       )
   2403 //..    );
   2404 //..
   2405 //..    /* If the high 32 of the result are non-zero, there was a
   2406 //..       failure in address translation.  In which case, make a
   2407 //..       quick exit.
   2408 //..    */
   2409 //..    stmt(
   2410 //..       IRStmt_Exit(
   2411 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2412 //..          Ijk_MapFail,
   2413 //..          IRConst_U32( guest_eip_curr_instr )
   2414 //..       )
   2415 //..    );
   2416 //..
   2417 //..    /* otherwise, here's the translated result. */
   2418 //..    return unop(Iop_64to32, mkexpr(r64));
   2419 //.. }
   2420 
   2421 
   2422 /* Generate IR to calculate an address indicated by a ModRM and
   2423    following SIB bytes.  The expression, and the number of bytes in
   2424    the address mode, are returned (the latter in *len).  Note that
   2425    this fn should not be called if the R/M part of the address denotes
   2426    a register instead of memory.  If print_codegen is true, text of
   2427    the addressing mode is placed in buf.
   2428 
   2429    The computed address is stored in a new tempreg, and the
   2430    identity of the tempreg is returned.
   2431 
   2432    extra_bytes holds the number of bytes after the amode, as supplied
   2433    by the caller.  This is needed to make sense of %rip-relative
   2434    addresses.  Note that the value that *len is set to is only the
   2435    length of the amode itself and does not include the value supplied
   2436    in extra_bytes.
   2437  */
   2438 
   2439 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2440 {
   2441    IRTemp tmp = newTemp(Ity_I64);
   2442    assign( tmp, addr64 );
   2443    return tmp;
   2444 }
   2445 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  const VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   /* Fetch the ModRM byte; 'delta' then points at whatever follows it
      (a SIB byte and/or a displacement, depending on the mode). */
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           /* Suppress a zero displacement in the printed form only;
              the generated IR is the same either way. */
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           /* delta currently points at the d32; +4 skips it, and
              extra_bytes covers any immediate after the amode. */
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* index is only RSP proper when REX.X does not extend it to R12 */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         /* Case: %base + (%index << scale) */
         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         /* Case: d32 + (%index << scale), no base register */
         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         /* Case: just %base -- the index contributes nothing */
         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         /* Case: just d32 -- neither base nor index contributes */
         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         /* The four cases above are exhaustive. */
         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d8 + %base
            | %index != %RSP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2713 
   2714 
   2715 /* Similarly for VSIB addressing.  This returns just the addend,
   2716    and fills in *rI and *vscale with the register number of the vector
   2717    index and its multiplicand.  */
static
IRTemp disAVSIBMode ( /*OUT*/Int* len,
                      const VexAbiInfo* vbi, Prefix pfx, Long delta,
                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
                      IRType ty, /*OUT*/Int* vscale )
{
   UChar mod_reg_rm = getUChar(delta);
   const HChar *vindex;

   /* Default the OUT params in case this turns out not to be a valid
      VSIB addressing mode. */
   *len = 0;
   *rI = 0;
   *vscale = 0;
   buf[0] = (UChar)0;
   /* VSIB requires a memory operand with a SIB byte (rm field == 4). */
   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
      return IRTemp_INVALID;

   UChar sib     = getUChar(delta+1);
   UChar scale   = toUChar((sib >> 6) & 3);
   UChar index_r = toUChar((sib >> 3) & 7);
   UChar base_r  = toUChar(sib & 7);
   Long  d       = 0;
   /* correct since #(R13) == 8 + #(RBP) */
   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   delta += 2;
   *len = 2;   /* ModRM byte + SIB byte consumed so far */

   /* The vector index register number includes the REX.X extension bit. */
   *rI = index_r | (getRexX(pfx) << 3);
   if (ty == Ity_V128)
      vindex = nameXMMReg(*rI);
   else
      vindex = nameYMMReg(*rI);
   *vscale = 1<<scale;

   switch (mod_reg_rm >> 6) {
   case 0:
      /* mod == 0: no displacement, except that base == RBP/R13 means a
         d32 follows and there is no base register at all. */
      if (base_is_BPor13) {
         d = getSDisp32(delta);
         *len += 4;
         if (scale == 0) {
            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
         } else {
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
         }
         /* Addend is just the literal displacement. */
         return disAMode_copy2tmp( mkU64(d) );
      } else {
         if (scale == 0) {
            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex);
         } else {
            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
         }
      }
      break;
   case 1:
      /* mod == 1: 8-bit displacement. */
      d = getSDisp8(delta);
      *len += 1;
      goto have_disp;
   case 2:
      /* mod == 2: 32-bit displacement; falls through to have_disp. */
      d = getSDisp32(delta);
      *len += 4;
   have_disp:
      if (scale == 0) {
         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex);
      } else {
         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
      }
      break;
   }

   /* The returned addend is base plus displacement; the caller applies
      the scaled vector index itself, using *rI and *vscale. */
   if (!d)
      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
                                   mkU64(d)) );
}
   2795 
   2796 
   2797 /* Figure out the number of (insn-stream) bytes constituting the amode
   2798    beginning at delta.  Is useful for getting hold of literals beyond
   2799    the end of the amode before it has been disassembled.  */
   2800 
   2801 static UInt lengthAMode ( Prefix pfx, Long delta )
   2802 {
   2803    UChar mod_reg_rm = getUChar(delta);
   2804    delta++;
   2805 
   2806    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2807       jump table seems a bit excessive.
   2808    */
   2809    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2810    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2811                                                /* is now XX0XXYYY */
   2812    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2813    switch (mod_reg_rm) {
   2814 
   2815       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2816          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2817       */
   2818       case 0x00: case 0x01: case 0x02: case 0x03:
   2819       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2820          return 1;
   2821 
   2822       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2823          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2824       */
   2825       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2826       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2827          return 2;
   2828 
   2829       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2830          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2831       */
   2832       case 0x10: case 0x11: case 0x12: case 0x13:
   2833       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2834          return 5;
   2835 
   2836       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2837       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2838       /* Not an address, but still handled. */
   2839       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2840       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2841          return 1;
   2842 
   2843       /* RIP + disp32. */
   2844       case 0x05:
   2845          return 5;
   2846 
   2847       case 0x04: {
   2848          /* SIB, with no displacement. */
   2849          UChar sib     = getUChar(delta);
   2850          UChar base_r  = toUChar(sib & 7);
   2851          /* correct since #(R13) == 8 + #(RBP) */
   2852          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2853 
   2854          if (base_is_BPor13) {
   2855             return 6;
   2856          } else {
   2857             return 2;
   2858          }
   2859       }
   2860 
   2861       /* SIB, with 8-bit displacement. */
   2862       case 0x0C:
   2863          return 3;
   2864 
   2865       /* SIB, with 32-bit displacement. */
   2866       case 0x14:
   2867          return 6;
   2868 
   2869       default:
   2870          vpanic("lengthAMode(amd64)");
   2871          return 0; /*notreached*/
   2872    }
   2873 }
   2874 
   2875 
   2876 /*------------------------------------------------------------*/
   2877 /*--- Disassembling common idioms                          ---*/
   2878 /*------------------------------------------------------------*/
   2879 
   2880 /* Handle binary integer instructions of the form
   2881       op E, G  meaning
   2882       op reg-or-mem, reg
   Is passed a pointer to the modRM byte, the actual operation, and the
   2884    data size.  Returns the address advanced completely over this
   2885    instruction.
   2886 
   2887    E(src) is reg-or-mem
   2888    G(dst) is reg.
   2889 
   2890    If E is reg, -->    GET %G,  tmp
   2891                        OP %E,   tmp
   2892                        PUT tmp, %G
   2893 
   2894    If E is mem and OP is not reversible,
   2895                 -->    (getAddr E) -> tmpa
   2896                        LD (tmpa), tmpa
   2897                        GET %G, tmp2
   2898                        OP tmpa, tmp2
   2899                        PUT tmp2, %G
   2900 
   2901    If E is mem and OP is reversible
   2902                 -->    (getAddr E) -> tmpa
   2903                        LD (tmpa), tmpa
   2904                        OP %G, tmpa
   2905                        PUT tmpa, %G
   2906 */
static
ULong dis_op2_E_G ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   /* dst0 = old value of G; src = E; dst1 = computed result.
      See the comment block preceding this function for the overall
      "op E, G" semantics. */
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* (disabled debug printout: the 'False &&' makes it dead) */
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
         /* Zero the register first so the op below reads a defined
            value rather than the bogus self-dependency. */
         putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      /* ADC/SBB need the old carry flag, so they go through helpers
         which also set the result flags. */
      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep == False for CMP-style ops: set flags, discard result. */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   3001 
   3002 
   3003 
   3004 /* Handle binary integer instructions of the form
   3005       op G, E  meaning
   3006       op reg, reg-or-mem
   Is passed a pointer to the modRM byte, the actual operation, and the
   3008    data size.  Returns the address advanced completely over this
   3009    instruction.
   3010 
   3011    G(src) is reg.
   3012    E(dst) is reg-or-mem
   3013 
   3014    If E is reg, -->    GET %E,  tmp
   3015                        OP %G,   tmp
   3016                        PUT tmp, %E
   3017 
   3018    If E is mem, -->    (getAddr E) -> tmpa
   3019                        LD (tmpa), tmpv
   3020                        OP %G, tmpv
   3021                        ST tmpv, (tmpa)
   3022 */
static
ULong dis_op2_G_E ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   /* dst0 = old value of E; src = G; dst1 = computed result.
      See the comment block preceding this function for the overall
      "op G, E" semantics. */
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* Zero the register first so the op below reads a defined
            value rather than the bogus self-dependency. */
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      /* ADC/SBB need the old carry flag, so they go through helpers
         which also set the result flags. */
      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep == False for CMP-style ops: set flags, discard result. */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      /* With a LOCK prefix the read-modify-write must be atomic, so
         the store becomes a compare-and-swap against the value read
         (dst0); on mismatch the guest restarts this instruction. */
      if (addSubCarry && op8 == Iop_Add8) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (haveLOCK(pfx)) {
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3138 
   3139 
   3140 /* Handle move instructions of the form
   3141       mov E, G  meaning
   3142       mov reg-or-mem, reg
   Is passed a pointer to the modRM byte, and the data size.  Returns
   3144    the address advanced completely over this instruction.
   3145 
   3146    E(src) is reg-or-mem
   3147    G(dst) is reg.
   3148 
   3149    If E is reg, -->    GET %E,  tmpv
   3150                        PUT tmpv, %G
   3151 
   3152    If E is mem  -->    (getAddr E) -> tmpa
   3153                        LD (tmpa), tmpb
   3154                        PUT tmpb, %G
   3155 */
   3156 static
   3157 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
   3158                     Prefix      pfx,
   3159                     Int         size,
   3160                     Long        delta0 )
   3161 {
   3162    Int len;
   3163    UChar rm = getUChar(delta0);
   3164    HChar dis_buf[50];
   3165 
   3166    if (epartIsReg(rm)) {
   3167       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3168       DIP("mov%c %s,%s\n", nameISize(size),
   3169                            nameIRegE(size,pfx,rm),
   3170                            nameIRegG(size,pfx,rm));
   3171       return 1+delta0;
   3172    }
   3173 
   3174    /* E refers to memory */
   3175    {
   3176       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3177       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3178       DIP("mov%c %s,%s\n", nameISize(size),
   3179                            dis_buf,
   3180                            nameIRegG(size,pfx,rm));
   3181       return delta0+len;
   3182    }
   3183 }
   3184 
   3185 
   3186 /* Handle move instructions of the form
   3187       mov G, E  meaning
   3188       mov reg, reg-or-mem
   Is passed a ptr to the modRM byte, and the data size.  Returns
   3190    the address advanced completely over this instruction.
   3191    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3192 
   3193    G(src) is reg.
   3194    E(dst) is reg-or-mem
   3195 
   3196    If E is reg, -->    GET %G,  tmp
   3197                        PUT tmp, %E
   3198 
   3199    If E is mem, -->    (getAddr E) -> tmpa
   3200                        GET %G, tmpv
   3201                        ST tmpv, (tmpa)
   3202 */
   3203 static
   3204 ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
   3205                     Prefix       pfx,
   3206                     Int          size,
   3207                     Long         delta0,
   3208                     /*OUT*/Bool* ok )
   3209 {
   3210    Int   len;
   3211    UChar rm = getUChar(delta0);
   3212    HChar dis_buf[50];
   3213 
   3214    *ok = True;
   3215 
   3216    if (epartIsReg(rm)) {
   3217       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3218       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3219       DIP("mov%c %s,%s\n", nameISize(size),
   3220                            nameIRegG(size,pfx,rm),
   3221                            nameIRegE(size,pfx,rm));
   3222       return 1+delta0;
   3223    }
   3224 
   3225    /* E refers to memory */
   3226    {
   3227       if (haveF2(pfx)) { *ok = False; return delta0; }
   3228       /* F3(XRELEASE) is acceptable, though. */
   3229       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3230       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3231       DIP("mov%c %s,%s\n", nameISize(size),
   3232                            nameIRegG(size,pfx,rm),
   3233                            dis_buf);
   3234       return len+delta0;
   3235    }
   3236 }
   3237 
   3238 
   3239 /* op $immediate, AL/AX/EAX/RAX. */
   3240 static
   3241 ULong dis_op_imm_A ( Int    size,
   3242                      Bool   carrying,
   3243                      IROp   op8,
   3244                      Bool   keep,
   3245                      Long   delta,
   3246                      const HChar* t_amd64opc )
   3247 {
   3248    Int    size4 = imin(size,4);
   3249    IRType ty    = szToITy(size);
   3250    IRTemp dst0  = newTemp(ty);
   3251    IRTemp src   = newTemp(ty);
   3252    IRTemp dst1  = newTemp(ty);
   3253    Long  lit    = getSDisp(size4,delta);
   3254    assign(dst0, getIRegRAX(size));
   3255    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3256 
   3257    if (isAddSub(op8) && !carrying) {
   3258       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3259       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3260    }
   3261    else
   3262    if (isLogic(op8)) {
   3263       vassert(!carrying);
   3264       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3265       setFlags_DEP1(op8, dst1, ty);
   3266    }
   3267    else
   3268    if (op8 == Iop_Add8 && carrying) {
   3269       helper_ADC( size, dst1, dst0, src,
   3270                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3271    }
   3272    else
   3273    if (op8 == Iop_Sub8 && carrying) {
   3274       helper_SBB( size, dst1, dst0, src,
   3275                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3276    }
   3277    else
   3278       vpanic("dis_op_imm_A(amd64,guest)");
   3279 
   3280    if (keep)
   3281       putIRegRAX(size, mkexpr(dst1));
   3282 
   3283    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3284                            lit, nameIRegRAX(size));
   3285    return delta+size4;
   3286 }
   3287 
   3288 
   3289 /* Sign- and Zero-extending moves. */
   3290 static
   3291 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
   3292                      Prefix pfx,
   3293                      Long delta, Int szs, Int szd, Bool sign_extend )
   3294 {
   3295    UChar rm = getUChar(delta);
   3296    if (epartIsReg(rm)) {
   3297       putIRegG(szd, pfx, rm,
   3298                     doScalarWidening(
   3299                        szs,szd,sign_extend,
   3300                        getIRegE(szs,pfx,rm)));
   3301       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3302                                nameISize(szs),
   3303                                nameISize(szd),
   3304                                nameIRegE(szs,pfx,rm),
   3305                                nameIRegG(szd,pfx,rm));
   3306       return 1+delta;
   3307    }
   3308 
   3309    /* E refers to memory */
   3310    {
   3311       Int    len;
   3312       HChar  dis_buf[50];
   3313       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3314       putIRegG(szd, pfx, rm,
   3315                     doScalarWidening(
   3316                        szs,szd,sign_extend,
   3317                        loadLE(szToITy(szs),mkexpr(addr))));
   3318       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3319                                nameISize(szs),
   3320                                nameISize(szd),
   3321                                dis_buf,
   3322                                nameIRegG(szd,pfx,rm));
   3323       return len+delta;
   3324    }
   3325 }
   3326 
   3327 
   3328 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3329    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* 128-bit dividend RDX:RAX divided by the 64-bit value in t.
         The DivMod op yields quotient in the low 64 bits and
         remainder in the high 64 bits. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      /* quotient -> RAX, remainder -> RDX */
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 1/2/4-byte cases: widen dividend/divisor as needed and use
         the single 64-by-32 DivMod op for all three sizes. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t(32) -> quotient EAX, remainder EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t(16): widen the 32-bit dividend to 64 and the
            16-bit divisor to 32, matching the signedness of the
            division, then narrow the results back to 16 bits. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t(8): quotient goes to AL, remainder to AH (note:
            not to DX, per the amd64 8-bit divide convention). */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3393 
/* Group 1 extended opcodes: ADD/OR/ADC/SBB/AND/SUB/XOR/CMP of an
   immediate against reg-or-mem, selected by bits 5:3 of the modRM
   byte.  d64 is the sign-extended immediate; am_sz/d_sz are the
   sizes of the address-mode and immediate fields; sz is the
   operation size.  Returns delta advanced past the instruction. */
static
ULong dis_Grp1 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* Map the /r field to an 8-bit IROp.  ADC (2) and SBB (3) stay
      Iop_INVALID since they go through dedicated helpers below.
      Case 7 (CMP) uses Sub8 but its result is discarded later. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (7) sets flags only; all other ops write back. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (7) does not touch memory; for the rest, a LOCKed
            instruction is expressed as a compare-and-swap on the
            original value. */
         if (gregLO3ofRM(modrm) < 7) {
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3499 
   3500 
   3501 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3502    expression. */
   3503 
static
ULong dis_Grp2 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addrdummy)));
      delta += len + d_sz;
   }

   /* Classify the /r field: 0/1 = ROL/ROR, 2/3 = RCL/RCR,
      4..7 = SHL/SHR/SAL/SAR.  (Intentional fall-through within
      each switch: all listed cases set the same flag.) */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: sz positive selects the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: sz negated selects the resulting rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Install the helper-computed flags verbatim via the COPY
         thunk. */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      /* (the "shifted one less" value, needed by the flags thunk to
         recover the last bit shifted out) */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the size-specific ROL/ROR thunk op, added to
         the ...ROLB/...RORB base below. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt64b = newTemp(Ity_I1);
      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each thunk field is updated conditionally (ITE) so that a
         zero rotate count leaves the flags state untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    widenUto64(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3775 
   3776 
   3777 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);
   IRTemp t2m    = newTemp(Ity_I64);
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Check whether F2 or F3 are acceptable. */
   if (epartIsReg(modrm)) {
      /* F2 or F3 are not allowed in the register case. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
     }
   } else {
      /* F2 or F3 (but not both) are allowable provided LOCK is also
         present. */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   /* (BT's mask is unused -- it only reads the selected bit.) */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   /* With LOCK, the update is expressed as a compare-and-swap
      against the originally-loaded value. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
        putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (haveLOCK(pfx)) {
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3908 
   3909 
   3910 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3911    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3912    RDX:RAX/EDX:EAX/DX:AX/AX.
   3913 */
   3914 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3915                                IRTemp tmp, const HChar* tmp_txt )
   3916 {
   3917    IRType ty = szToITy(sz);
   3918    IRTemp t1 = newTemp(ty);
   3919 
   3920    assign( t1, getIRegRAX(sz) );
   3921 
   3922    switch (ty) {
   3923       case Ity_I64: {
   3924          IRTemp res128  = newTemp(Ity_I128);
   3925          IRTemp resHi   = newTemp(Ity_I64);
   3926          IRTemp resLo   = newTemp(Ity_I64);
   3927          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3928          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3929          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3930          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3931          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3932          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3933          putIReg64(R_RDX, mkexpr(resHi));
   3934          putIReg64(R_RAX, mkexpr(resLo));
   3935          break;
   3936       }
   3937       case Ity_I32: {
   3938          IRTemp res64   = newTemp(Ity_I64);
   3939          IRTemp resHi   = newTemp(Ity_I32);
   3940          IRTemp resLo   = newTemp(Ity_I32);
   3941          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3942          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3943          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3944          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3945          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3946          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3947          putIRegRDX(4, mkexpr(resHi));
   3948          putIRegRAX(4, mkexpr(resLo));
   3949          break;
   3950       }
   3951       case Ity_I16: {
   3952          IRTemp res32   = newTemp(Ity_I32);
   3953          IRTemp resHi   = newTemp(Ity_I16);
   3954          IRTemp resLo   = newTemp(Ity_I16);
   3955          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3956          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3957          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3958          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3959          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3960          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3961          putIRegRDX(2, mkexpr(resHi));
   3962          putIRegRAX(2, mkexpr(resLo));
   3963          break;
   3964       }
   3965       case Ity_I8: {
   3966          IRTemp res16   = newTemp(Ity_I16);
   3967          IRTemp resHi   = newTemp(Ity_I8);
   3968          IRTemp resLo   = newTemp(Ity_I8);
   3969          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3970          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3971          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3972          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3973          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3974          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3975          putIRegRAX(2, mkexpr(res16));
   3976          break;
   3977       }
   3978       default:
   3979          ppIRType(ty);
   3980          vpanic("codegen_mulL_A_D(amd64)");
   3981    }
   3982    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3983 }
   3984 
   3985 
/* Group 3 extended opcodes (0xF6/0xF7: TEST/NOT/NEG/MUL/IMUL/DIV/IDIV,
   selected by the reg field of the modRM byte).  We have to decide here
   whether F2 and F3 might be valid.*/
static
ULong dis_Grp3 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   /* Returns the updated instruction offset; clears *decode_OK on any
      undecodable form instead of faulting. */
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            /* AND register with the sign-extended 1/2/4-byte
               immediate, set flags, discard the result. */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* /1 is an unassigned encoding in Grp3. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            /* dst1 = 0 - src, with flags as for SUB(0, src). */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Memory-operand forms. */
      /* Decide if F2/XACQ or F3/XREL might be valid: only on the
         LOCKed read-modify-write forms (NOT, NEG), and only one of
         F2/F3, not both. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      /* Fresh temp: the one allocated at function entry is reused by
         the register cases only. */
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* LOCKed form: compare-and-swap against the value loaded
               above, re-executing the insn on CAS failure. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4162 
   4163 
/* Group 4 extended opcodes (0xFE: byte-sized INC/DEC, selected by the
   reg field of the modRM byte).  We have to decide here whether F2 and
   F3 might be valid. */
static
ULong dis_Grp4 ( const VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   /* Returns the updated instruction offset; clears *decode_OK on any
      undecodable form. */
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);
   IRTemp t2 = newTemp(ty);

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            /* Encodings /2../7 are not part of Grp4. */
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Memory-operand forms. */
      /* Decide if F2/XACQ or F3/XREL might be valid: only on LOCKed
         INC/DEC, and only one of F2/F3, not both. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            /* LOCKed form: CAS against the value loaded above. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4246 
   4247 
/* Group 5 extended opcodes (0xFF: INC/DEC/CALL/JMP/PUSH on Ev,
   selected by the reg field of the modRM byte).  We have to decide
   here whether F2 and F3 might be valid. */
static
ULong dis_Grp5 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   /* Returns the updated instruction offset; clears *decode_OK on any
      undecodable form.  CALL/JMP cases terminate the block via
      jmp_treg and set dres accordingly. */
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;  /* suppress size suffix in DIP for call/jmp */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
         F2/CALL and F2/JMP may have bnd prefix. */
     if (haveF2orF3(pfx)
         && ! (haveF2(pfx)
               && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
        goto unhandledR;
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            /* Push the return address, then jump to the target held
               in the register. */
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, getIRegE(sz,pfx,modrm));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledR; /* awaiting test case */
            }
         default:
         unhandledR:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      /* Memory-operand forms. */
      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
                 && (haveF2(pfx) && !haveF3(pfx))) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandledM;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* CALL/JMP/PUSH load for themselves (at width 8 for call/jmp);
         only INC/DEC use the pre-load into t1. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            /* LOCKed form: CAS against the value loaded above. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, loadLE(ty,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledM; /* awaiting test case */
            }
         default:
         unhandledM:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4438 
   4439 
   4440 /*------------------------------------------------------------*/
   4441 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4442 /*------------------------------------------------------------*/
   4443 
   4444 /* Code shared by all the string ops */
   4445 static
   4446 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4447 {
   4448    UChar logSz;
   4449    if (sz == 8 || sz == 4 || sz == 2) {
   4450       logSz = 1;
   4451       if (sz == 4) logSz = 2;
   4452       if (sz == 8) logSz = 3;
   4453       assign( t_inc,
   4454               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4455                                mkU8(logSz) ) );
   4456    } else {
   4457       assign( t_inc,
   4458               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4459    }
   4460 }
   4461 
   4462 static
   4463 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4464                     Int sz, const HChar* name, Prefix pfx )
   4465 {
   4466    IRTemp t_inc = newTemp(Ity_I64);
   4467    /* Really we ought to inspect the override prefixes, but we don't.
   4468       The following assertion catches any resulting sillyness. */
   4469    vassert(pfx == clearSegBits(pfx));
   4470    dis_string_op_increment(sz, t_inc);
   4471    dis_OP( sz, t_inc, pfx );
   4472    DIP("%s%c\n", name, nameISize(sz));
   4473 }
   4474 
   4475 static
   4476 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4477 {
   4478    IRType ty = szToITy(sz);
   4479    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4480    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4481    IRExpr *incd, *incs;
   4482 
   4483    if (haveASO(pfx)) {
   4484       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4485       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4486    } else {
   4487       assign( td, getIReg64(R_RDI) );
   4488       assign( ts, getIReg64(R_RSI) );
   4489    }
   4490 
   4491    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4492 
   4493    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4494    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4495    if (haveASO(pfx)) {
   4496       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4497       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4498    }
   4499    putIReg64( R_RDI, incd );
   4500    putIReg64( R_RSI, incs );
   4501 }
   4502 
   4503 static
   4504 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4505 {
   4506    IRType ty = szToITy(sz);
   4507    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4508    IRExpr *incs;
   4509 
   4510    if (haveASO(pfx))
   4511       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4512    else
   4513       assign( ts, getIReg64(R_RSI) );
   4514 
   4515    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4516 
   4517    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4518    if (haveASO(pfx))
   4519       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4520    putIReg64( R_RSI, incs );
   4521 }
   4522 
   4523 static
   4524 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4525 {
   4526    IRType ty = szToITy(sz);
   4527    IRTemp ta = newTemp(ty);        /* rAX */
   4528    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4529    IRExpr *incd;
   4530 
   4531    assign( ta, getIRegRAX(sz) );
   4532 
   4533    if (haveASO(pfx))
   4534       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4535    else
   4536       assign( td, getIReg64(R_RDI) );
   4537 
   4538    storeLE( mkexpr(td), mkexpr(ta) );
   4539 
   4540    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4541    if (haveASO(pfx))
   4542       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4543    putIReg64( R_RDI, incd );
   4544 }
   4545 
   4546 static
   4547 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4548 {
   4549    IRType ty  = szToITy(sz);
   4550    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4551    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4552    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4553    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4554    IRExpr *incd, *incs;
   4555 
   4556    if (haveASO(pfx)) {
   4557       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4558       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4559    } else {
   4560       assign( td, getIReg64(R_RDI) );
   4561       assign( ts, getIReg64(R_RSI) );
   4562    }
   4563 
   4564    assign( tdv, loadLE(ty,mkexpr(td)) );
   4565 
   4566    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4567 
   4568    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4569 
   4570    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4571    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4572    if (haveASO(pfx)) {
   4573       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4574       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4575    }
   4576    putIReg64( R_RDI, incd );
   4577    putIReg64( R_RSI, incs );
   4578 }
   4579 
   4580 static
   4581 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4582 {
   4583    IRType ty  = szToITy(sz);
   4584    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4585    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4586    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4587    IRExpr *incd;
   4588 
   4589    assign( ta, getIRegRAX(sz) );
   4590 
   4591    if (haveASO(pfx))
   4592       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4593    else
   4594       assign( td, getIReg64(R_RDI) );
   4595 
   4596    assign( tdv, loadLE(ty,mkexpr(td)) );
   4597 
   4598    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4599 
   4600    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4601    if (haveASO(pfx))
   4602       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4603    putIReg64( R_RDI, incd );
   4604 }
   4605 
   4606 
   4607 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4608    the insn is the last one in the basic block, and so emit a jump to
   4609    the next insn, rather than just falling through. */
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp, Prefix),
                  Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
                  Prefix pfx )
{
   /* t_inc: per-iteration address increment, filled in by
      dis_string_op_increment below.  tc: the loop counter (RCX, or
      ECX if there is an address-size override). */
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc;
   IRExpr* cmp;

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting sillyness. */
   vassert(pfx == clearSegBits(pfx));

   /* Read the counter and build a "counter == 0" test of the
      appropriate width. */
   if (haveASO(pfx)) {
      tc = newTemp(Ity_I32);  /*  ECX  */
      assign( tc, getIReg32(R_RCX) );
      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   } else {
      tc = newTemp(Ity_I64);  /*  RCX  */
      assign( tc, getIReg64(R_RCX) );
      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   }

   /* If the counter is already zero, skip the whole insn: exit
      straight to the next instruction. */
   stmt( IRStmt_Exit( cmp, Ijk_Boring,
                      IRConst_U64(rip_next), OFFB_RIP ) );

   /* Otherwise decrement the counter, ... */
   if (haveASO(pfx))
      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
  else
      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   /* ... and do one iteration of the string op. */
   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc, pfx);

   /* One iteration per translation: either unconditionally loop back
      to this insn (plain REP), or exit the loop / fall through
      according to 'cond' (REPE/REPNE). */
   if (cond == AMD64CondAlways) {
      jmp_lit(dres, Ijk_Boring, rip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip),
                         OFFB_RIP ) );
      jmp_lit(dres, Ijk_Boring, rip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
   4659 
   4660 
   4661 /*------------------------------------------------------------*/
   4662 /*--- Arithmetic, etc.                                     ---*/
   4663 /*------------------------------------------------------------*/
   4664 
   4665 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
static
ULong dis_mul_E_G ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   /* Decode and translate IMUL E,G (two-operand signed multiply).
      'delta0' points at the modRM byte; the return value is the
      delta just past the decoded insn. */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* the E (reg-or-mem) operand */
   IRTemp tg = newTemp(ty);      /* the G (register) operand */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   assign( tg, getIRegG(size, pfx, rm) );
   if (epartIsReg(rm)) {
      assign( te, getIRegE(size, pfx, rm) );
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   /* Set the flags thunk from the two factors.  AMD64G_CC_OP_SMULB
      is the byte-sized base op; setFlags_MUL presumably derives the
      size-specific variant from 'ty' -- see its definition. */
   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   /* Only the low half of the product is written back, into G. */
   putIRegG(size, pfx, rm, mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIRegE(size,pfx,rm),
                             nameIRegG(size,pfx,rm));
      return 1+delta0;      /* just the modRM byte to skip */
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf,
                             nameIRegG(size,pfx,rm));
      return alen+delta0;   /* skip the whole addressing mode */
   }
}
   4706 
   4707 
   4708 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   /* Decode and translate IMUL G := E * imm (three-operand form).
      'delta' points at the modRM byte; 'litsize' is the nominal size
      of the immediate.  Returns the delta just past the insn. */
   Long   d64;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* the E (reg-or-mem) operand */
   IRTemp tl = newTemp(ty);      /* the immediate, as an IR constant */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* The final arg tells disAMode how many bytes of immediate
         follow the addressing mode, for displacement computation. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   /* The immediate is at most 4 bytes (there is no imm64 form); it is
      read sign-extended ... */
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* ... and then truncated to the operand size. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   /* Set the flags thunk; AMD64G_CC_OP_SMULB is the byte-sized base
      op, presumably adjusted for 'ty' inside setFlags_MUL. */
   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4754 
   4755 
   4756 /* Generate an IR sequence to do a popcount operation on the supplied
   4757    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4758    Ity_I16, Ity_I32 or Ity_I64 only. */
   4759 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4760 {
   4761    Int i;
   4762    if (ty == Ity_I16) {
   4763       IRTemp old = IRTemp_INVALID;
   4764       IRTemp nyu = IRTemp_INVALID;
   4765       IRTemp mask[4], shift[4];
   4766       for (i = 0; i < 4; i++) {
   4767          mask[i]  = newTemp(ty);
   4768          shift[i] = 1 << i;
   4769       }
   4770       assign(mask[0], mkU16(0x5555));
   4771       assign(mask[1], mkU16(0x3333));
   4772       assign(mask[2], mkU16(0x0F0F));
   4773       assign(mask[3], mkU16(0x00FF));
   4774       old = src;
   4775       for (i = 0; i < 4; i++) {
   4776          nyu = newTemp(ty);
   4777          assign(nyu,
   4778                 binop(Iop_Add16,
   4779                       binop(Iop_And16,
   4780                             mkexpr(old),
   4781                             mkexpr(mask[i])),
   4782                       binop(Iop_And16,
   4783                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4784                             mkexpr(mask[i]))));
   4785          old = nyu;
   4786       }
   4787       return nyu;
   4788    }
   4789    if (ty == Ity_I32) {
   4790       IRTemp old = IRTemp_INVALID;
   4791       IRTemp nyu = IRTemp_INVALID;
   4792       IRTemp mask[5], shift[5];
   4793       for (i = 0; i < 5; i++) {
   4794          mask[i]  = newTemp(ty);
   4795          shift[i] = 1 << i;
   4796       }
   4797       assign(mask[0], mkU32(0x55555555));
   4798       assign(mask[1], mkU32(0x33333333));
   4799       assign(mask[2], mkU32(0x0F0F0F0F));
   4800       assign(mask[3], mkU32(0x00FF00FF));
   4801       assign(mask[4], mkU32(0x0000FFFF));
   4802       old = src;
   4803       for (i = 0; i < 5; i++) {
   4804          nyu = newTemp(ty);
   4805          assign(nyu,
   4806                 binop(Iop_Add32,
   4807                       binop(Iop_And32,
   4808                             mkexpr(old),
   4809                             mkexpr(mask[i])),
   4810                       binop(Iop_And32,
   4811                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4812                             mkexpr(mask[i]))));
   4813          old = nyu;
   4814       }
   4815       return nyu;
   4816    }
   4817    if (ty == Ity_I64) {
   4818       IRTemp old = IRTemp_INVALID;
   4819       IRTemp nyu = IRTemp_INVALID;
   4820       IRTemp mask[6], shift[6];
   4821       for (i = 0; i < 6; i++) {
   4822          mask[i]  = newTemp(ty);
   4823          shift[i] = 1 << i;
   4824       }
   4825       assign(mask[0], mkU64(0x5555555555555555ULL));
   4826       assign(mask[1], mkU64(0x3333333333333333ULL));
   4827       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4828       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4829       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4830       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4831       old = src;
   4832       for (i = 0; i < 6; i++) {
   4833          nyu = newTemp(ty);
   4834          assign(nyu,
   4835                 binop(Iop_Add64,
   4836                       binop(Iop_And64,
   4837                             mkexpr(old),
   4838                             mkexpr(mask[i])),
   4839                       binop(Iop_And64,
   4840                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4841                             mkexpr(mask[i]))));
   4842          old = nyu;
   4843       }
   4844       return nyu;
   4845    }
   4846    /*NOTREACHED*/
   4847    vassert(0);
   4848 }
   4849 
   4850 
   4851 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4852    the supplied IRTemp, and return a new IRTemp holding the result.
   4853    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4854    the argument is zero, return the number of bits in the word (the
   4855    natural semantics). */
   4856 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4857 {
   4858    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4859 
   4860    IRTemp src64 = newTemp(Ity_I64);
   4861    assign(src64, widenUto64( mkexpr(src) ));
   4862 
   4863    IRTemp src64x = newTemp(Ity_I64);
   4864    assign(src64x,
   4865           binop(Iop_Shl64, mkexpr(src64),
   4866                            mkU8(64 - 8 * sizeofIRType(ty))));
   4867 
   4868    // Clz64 has undefined semantics when its input is zero, so
   4869    // special-case around that.
   4870    IRTemp res64 = newTemp(Ity_I64);
   4871    assign(res64,
   4872           IRExpr_ITE(
   4873              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4874              mkU64(8 * sizeofIRType(ty)),
   4875              unop(Iop_Clz64, mkexpr(src64x))
   4876    ));
   4877 
   4878    IRTemp res = newTemp(ty);
   4879    assign(res, narrowTo(ty, mkexpr(res64)));
   4880    return res;
   4881 }
   4882 
   4883 
   4884 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   4885    the supplied IRTemp, and return a new IRTemp holding the result.
   4886    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4887    the argument is zero, return the number of bits in the word (the
   4888    natural semantics). */
   4889 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   4890 {
   4891    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4892 
   4893    IRTemp src64 = newTemp(Ity_I64);
   4894    assign(src64, widenUto64( mkexpr(src) ));
   4895 
   4896    // Ctz64 has undefined semantics when its input is zero, so
   4897    // special-case around that.
   4898    IRTemp res64 = newTemp(Ity_I64);
   4899    assign(res64,
   4900           IRExpr_ITE(
   4901              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   4902              mkU64(8 * sizeofIRType(ty)),
   4903              unop(Iop_Ctz64, mkexpr(src64))
   4904    ));
   4905 
   4906    IRTemp res = newTemp(ty);
   4907    assign(res, narrowTo(ty, mkexpr(res64)));
   4908    return res;
   4909 }
   4910 
   4911 
   4912 /*------------------------------------------------------------*/
   4913 /*---                                                      ---*/
   4914 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4915 /*---                                                      ---*/
   4916 /*------------------------------------------------------------*/
   4917 
   4918 /* --- Helper functions for dealing with the register stack. --- */
   4919 
   4920 /* --- Set the emulation-warning pseudo-register. --- */
   4921 
   4922 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   4923 {
   4924    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4925    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   4926 }
   4927 
   4928 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4929 
   4930 static IRExpr* mkQNaN64 ( void )
   4931 {
   4932   /* QNaN is 0 2047 1 0(51times)
   4933      == 0b 11111111111b 1 0(51times)
   4934      == 0x7FF8 0000 0000 0000
   4935    */
   4936    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   4937 }
   4938 
   4939 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4940 
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack index (an I32) from the guest state. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4945 
   4946 static void put_ftop ( IRExpr* e )
   4947 {
   4948    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4949    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   4950 }
   4951 
   4952 /* --------- Get/put the C3210 bits. --------- */
   4953 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* Read the FPU C3..C0 condition bits field from the guest state. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4958 
   4959 static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
   4960 {
   4961    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   4962    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   4963 }
   4964 
   4965 /* --------- Get/put the FPU rounding mode. --------- */
   4966 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   4967 {
   4968    return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
   4969 }
   4970 
   4971 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   4972 {
   4973    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4974    stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
   4975 }
   4976 
   4977 
   4978 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4979 /* Produces a value in 0 .. 3, which is encoded as per the type
   4980    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4981    per IRRoundingMode, we merely need to get it and mask it for
   4982    safety.
   4983 */
   4984 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   4985 {
   4986    return binop( Iop_And32, get_fpround(), mkU32(3) );
   4987 }
   4988 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* A placeholder rounding mode: always round-to-nearest, ignoring
      the guest's actual FPROUND setting (see XXXROUNDINGFIXME at the
      call sites). */
   return mkU32(Irrm_NEAREST);
}
   4993 
   4994 
   4995 /* --------- Get/set FP register tag bytes. --------- */
   4996 
   4997 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4998 
   4999 static void put_ST_TAG ( Int i, IRExpr* value )
   5000 {
   5001    IRRegArray* descr;
   5002    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   5003    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5004    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5005 }
   5006 
   5007 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5008    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5009 
   5010 static IRExpr* get_ST_TAG ( Int i )
   5011 {
   5012    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5013    return IRExpr_GetI( descr, get_ftop(), i );
   5014 }
   5015 
   5016 
   5017 /* --------- Get/set FP registers. --------- */
   5018 
   5019 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5020    register's tag to indicate the register is full.  The previous
   5021    state of the register is not checked. */
   5022 
   5023 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   5024 {
   5025    IRRegArray* descr;
   5026    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   5027    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5028    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5029    /* Mark the register as in-use. */
   5030    put_ST_TAG(i, mkU8(1));
   5031 }
   5032 
   5033 /* Given i, and some expression e, emit
   5034       ST(i) = is_full(i) ? NaN : e
   5035    and set the tag accordingly.
   5036 */
   5037 
   5038 static void put_ST ( Int i, IRExpr* value )
   5039 {
   5040    put_ST_UNCHECKED(
   5041       i,
   5042       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5043                   /* non-0 means full */
   5044                   mkQNaN64(),
   5045                   /* 0 means empty */
   5046                   value
   5047       )
   5048    );
   5049 }
   5050 
   5051 
   5052 /* Given i, generate an expression yielding 'ST(i)'. */
   5053 
   5054 static IRExpr* get_ST_UNCHECKED ( Int i )
   5055 {
   5056    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5057    return IRExpr_GetI( descr, get_ftop(), i );
   5058 }
   5059 
   5060 
   5061 /* Given i, generate an expression yielding
   5062   is_full(i) ? ST(i) : NaN
   5063 */
   5064 
   5065 static IRExpr* get_ST ( Int i )
   5066 {
   5067    return
   5068       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5069                   /* non-0 means full */
   5070                   get_ST_UNCHECKED(i),
   5071                   /* 0 means empty */
   5072                   mkQNaN64());
   5073 }
   5074 
   5075 
   5076 /* Given i, and some expression e, and a condition cond, generate IR
   5077    which has the same effect as put_ST(i,e) when cond is true and has
   5078    no effect when cond is false.  Given the lack of proper
   5079    if-then-else in the IR, this is pretty tricky.
   5080 */
   5081 
   5082 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   5083 {
   5084    // new_tag = if cond then FULL else old_tag
   5085    // new_val = if cond then (if old_tag==FULL then NaN else val)
   5086    //                   else old_val
   5087 
   5088    IRTemp old_tag = newTemp(Ity_I8);
   5089    assign(old_tag, get_ST_TAG(i));
   5090    IRTemp new_tag = newTemp(Ity_I8);
   5091    assign(new_tag,
   5092           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   5093 
   5094    IRTemp old_val = newTemp(Ity_F64);
   5095    assign(old_val, get_ST_UNCHECKED(i));
   5096    IRTemp new_val = newTemp(Ity_F64);
   5097    assign(new_val,
   5098           IRExpr_ITE(mkexpr(cond),
   5099                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   5100                                 /* non-0 means full */
   5101                                 mkQNaN64(),
   5102                                 /* 0 means empty */
   5103                                 value),
   5104                      mkexpr(old_val)));
   5105 
   5106    put_ST_UNCHECKED(i, mkexpr(new_val));
   5107    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   5108    // now set it to new_tag instead.
   5109    put_ST_TAG(i, mkexpr(new_tag));
   5110 }
   5111 
   5112 /* Adjust FTOP downwards by one register. */
   5113 
   5114 static void fp_push ( void )
   5115 {
   5116    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   5117 }
   5118 
   5119 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5120    don't change it. */
   5121 
   5122 static void maybe_fp_push ( IRTemp cond )
   5123 {
   5124    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   5125 }
   5126 
   5127 /* Adjust FTOP upwards by one register, and mark the vacated register
   5128    as empty.  */
   5129 
   5130 static void fp_pop ( void )
   5131 {
   5132    put_ST_TAG(0, mkU8(0));
   5133    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5134 }
   5135 
   5136 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   5137    e[31:1] == 0.
   5138 */
   5139 static void set_C2 ( IRExpr* e )
   5140 {
   5141    IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   5142    put_C3210( binop(Iop_Or64,
   5143                     cleared,
   5144                     binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
   5145 }
   5146 
   5147 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5148    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5149    test is simple, but the derivation of it is not so simple.
   5150 
   5151    The exponent field for an IEEE754 double is 11 bits.  That means it
   5152    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5153    the number is either a NaN or an Infinity and so is not finite.
   5154    Furthermore, a finite value of exactly 2^63 is the smallest value
   5155    that has exponent value 0x43E.  Hence, what we need to do is
   5156    extract the exponent, ignoring the sign bit and mantissa, and check
   5157    it is < 0x43E, or <= 0x43D.
   5158 
   5159    To make this easily applicable to 32- and 64-bit targets, a
   5160    roundabout approach is used.  First the number is converted to I64,
   5161    then the top 32 bits are taken.  Shifting them right by 20 bits
   5162    places the sign bit and exponent in the bottom 12 bits.  Anding
   5163    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5164    available for comparison.
   5165 */
   5166 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   5167 {
   5168    IRTemp i64 = newTemp(Ity_I64);
   5169    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   5170    IRTemp exponent = newTemp(Ity_I32);
   5171    assign(exponent,
   5172           binop(Iop_And32,
   5173                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   5174                 mkU32(0x7FF)));
   5175    IRTemp in_range_and_finite = newTemp(Ity_I1);
   5176    assign(in_range_and_finite,
   5177           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   5178    return in_range_and_finite;
   5179 }
   5180 
   5181 /* Invent a plausible-looking FPU status word value:
   5182       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5183  */
   5184 static IRExpr* get_FPU_sw ( void )
   5185 {
   5186    return
   5187       unop(Iop_32to16,
   5188            binop(Iop_Or32,
   5189                  binop(Iop_Shl32,
   5190                        binop(Iop_And32, get_ftop(), mkU32(7)),
   5191                              mkU8(11)),
   5192                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   5193                                         mkU32(0x4700))
   5194       ));
   5195 }
   5196 
   5197 
   5198 /* ------------------------------------------------------- */
   5199 /* Given all that stack-mangling junk, we can now go ahead
   5200    and describe FP instructions.
   5201 */
   5202 
   5203 /* ST(0) = ST(0) `op` mem64/32(addr)
   5204    Need to check ST(0)'s tag on read, but not on write.
   5205 */
   5206 static
   5207 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5208                          IROp op, Bool dbl )
   5209 {
   5210    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5211    if (dbl) {
   5212       put_ST_UNCHECKED(0,
   5213          triop( op,
   5214                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5215                 get_ST(0),
   5216                 loadLE(Ity_F64,mkexpr(addr))
   5217          ));
   5218    } else {
   5219       put_ST_UNCHECKED(0,
   5220          triop( op,
   5221                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5222                 get_ST(0),
   5223                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5224          ));
   5225    }
   5226 }
   5227 
   5228 
   5229 /* ST(0) = mem64/32(addr) `op` ST(0)
   5230    Need to check ST(0)'s tag on read, but not on write.
   5231 */
   5232 static
   5233 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5234                             IROp op, Bool dbl )
   5235 {
   5236    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5237    if (dbl) {
   5238       put_ST_UNCHECKED(0,
   5239          triop( op,
   5240                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5241                 loadLE(Ity_F64,mkexpr(addr)),
   5242                 get_ST(0)
   5243          ));
   5244    } else {
   5245       put_ST_UNCHECKED(0,
   5246          triop( op,
   5247                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5248                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5249                 get_ST(0)
   5250          ));
   5251    }
   5252 }
   5253 
   5254 
   5255 /* ST(dst) = ST(dst) `op` ST(src).
   5256    Check dst and src tags when reading but not on write.
   5257 */
   5258 static
   5259 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5260                       Bool pop_after )
   5261 {
   5262    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5263    put_ST_UNCHECKED(
   5264       st_dst,
   5265       triop( op,
   5266              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5267              get_ST(st_dst),
   5268              get_ST(st_src) )
   5269    );
   5270    if (pop_after)
   5271       fp_pop();
   5272 }
   5273 
   5274 /* ST(dst) = ST(src) `op` ST(dst).
   5275    Check dst and src tags when reading but not on write.
   5276 */
   5277 static
   5278 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5279                          Bool pop_after )
   5280 {
   5281    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5282    put_ST_UNCHECKED(
   5283       st_dst,
   5284       triop( op,
   5285              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5286              get_ST(st_src),
   5287              get_ST(st_dst) )
   5288    );
   5289    if (pop_after)
   5290       fp_pop();
   5291 }
   5292 
   5293 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   5294 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   5295 {
   5296    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   5297    /* This is a bit of a hack (and isn't really right).  It sets
   5298       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   5299       documentation implies A and S are unchanged.
   5300    */
   5301    /* It's also fishy in that it is used both for COMIP and
   5302       UCOMIP, and they aren't the same (although similar). */
   5303    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   5304    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   5305    stmt( IRStmt_Put(
   5306             OFFB_CC_DEP1,
   5307             binop( Iop_And64,
   5308                    unop( Iop_32Uto64,
   5309                          binop(Iop_CmpF64, get_ST(0), get_ST(i))),
   5310                    mkU64(0x45)
   5311         )));
   5312    if (pop_after)
   5313       fp_pop();
   5314 }
   5315 
   5316 
   5317 /* returns
   5318    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5319 */
   5320 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   5321 {
   5322    IRTemp t32 = newTemp(Ity_I32);
   5323    assign( t32, e32 );
   5324    return
   5325       IRExpr_ITE(
   5326          binop(Iop_CmpLT64U,
   5327                unop(Iop_32Uto64,
   5328                     binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5329                mkU64(65536)),
   5330          unop(Iop_32to16, mkexpr(t32)),
   5331          mkU16( 0x8000 ) );
   5332 }
   5333 
   5334 
   5335 static
   5336 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5337                 const VexAbiInfo* vbi, Prefix pfx, Long delta )
   5338 {
   5339    Int    len;
   5340    UInt   r_src, r_dst;
   5341    HChar  dis_buf[50];
   5342    IRTemp t1, t2;
   5343 
   5344    /* On entry, delta points at the second byte of the insn (the modrm
   5345       byte).*/
   5346    UChar first_opcode = getUChar(delta-1);
   5347    UChar modrm        = getUChar(delta+0);
   5348 
   5349    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5350 
   5351    if (first_opcode == 0xD8) {
   5352       if (modrm < 0xC0) {
   5353 
   5354          /* bits 5,4,3 are an opcode extension, and the modRM also
   5355            specifies an address. */
   5356          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5357          delta += len;
   5358 
   5359          switch (gregLO3ofRM(modrm)) {
   5360 
   5361             case 0: /* FADD single-real */
   5362                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5363                break;
   5364 
   5365             case 1: /* FMUL single-real */
   5366                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5367                break;
   5368 
   5369             case 2: /* FCOM single-real */
   5370                DIP("fcoms %s\n", dis_buf);
   5371                /* This forces C1 to zero, which isn't right. */
   5372                /* The AMD documentation suggests that forcing C1 to
   5373                   zero is correct (Eliot Moss) */
   5374                put_C3210(
   5375                    unop( Iop_32Uto64,
   5376                        binop( Iop_And32,
   5377                               binop(Iop_Shl32,
   5378                                     binop(Iop_CmpF64,
   5379                                           get_ST(0),
   5380                                           unop(Iop_F32toF64,
   5381                                                loadLE(Ity_F32,mkexpr(addr)))),
   5382                                     mkU8(8)),
   5383                               mkU32(0x4500)
   5384                    )));
   5385                break;
   5386 
   5387             case 3: /* FCOMP single-real */
   5388                /* The AMD documentation suggests that forcing C1 to
   5389                   zero is correct (Eliot Moss) */
   5390                DIP("fcomps %s\n", dis_buf);
   5391                /* This forces C1 to zero, which isn't right. */
   5392                put_C3210(
   5393                    unop( Iop_32Uto64,
   5394                        binop( Iop_And32,
   5395                               binop(Iop_Shl32,
   5396                                     binop(Iop_CmpF64,
   5397                                           get_ST(0),
   5398                                           unop(Iop_F32toF64,
   5399                                                loadLE(Ity_F32,mkexpr(addr)))),
   5400                                     mkU8(8)),
   5401                               mkU32(0x4500)
   5402                    )));
   5403                fp_pop();
   5404                break;
   5405 
   5406             case 4: /* FSUB single-real */
   5407                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5408                break;
   5409 
   5410             case 5: /* FSUBR single-real */
   5411                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5412                break;
   5413 
   5414             case 6: /* FDIV single-real */
   5415                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5416                break;
   5417 
   5418             case 7: /* FDIVR single-real */
   5419                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5420                break;
   5421 
   5422             default:
   5423                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5424                vex_printf("first_opcode == 0xD8\n");
   5425                goto decode_fail;
   5426          }
   5427       } else {
   5428          delta++;
   5429          switch (modrm) {
   5430 
   5431             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5432                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5433                break;
   5434 
   5435             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5436                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5437                break;
   5438 
   5439             /* Dunno if this is right */
   5440             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5441                r_dst = (UInt)modrm - 0xD0;
   5442                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   5443                /* This forces C1 to zero, which isn't right. */
   5444                put_C3210(
   5445                    unop(Iop_32Uto64,
   5446                    binop( Iop_And32,
   5447                           binop(Iop_Shl32,
   5448                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5449                                 mkU8(8)),
   5450                           mkU32(0x4500)
   5451                    )));
   5452                break;
   5453 
   5454             /* Dunno if this is right */
   5455             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5456                r_dst = (UInt)modrm - 0xD8;
   5457                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   5458                /* This forces C1 to zero, which isn't right. */
   5459                put_C3210(
   5460                    unop(Iop_32Uto64,
   5461                    binop( Iop_And32,
   5462                           binop(Iop_Shl32,
   5463                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5464                                 mkU8(8)),
   5465                           mkU32(0x4500)
   5466                    )));
   5467                fp_pop();
   5468                break;
   5469 
   5470             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5471                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5472                break;
   5473 
   5474             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5475                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5476                break;
   5477 
   5478             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5479                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5480                break;
   5481 
   5482             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5483                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5484                break;
   5485 
   5486             default:
   5487                goto decode_fail;
   5488          }
   5489       }
   5490    }
   5491 
   5492    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5493    else
   5494    if (first_opcode == 0xD9) {
   5495       if (modrm < 0xC0) {
   5496 
   5497          /* bits 5,4,3 are an opcode extension, and the modRM also
   5498             specifies an address. */
   5499          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5500          delta += len;
   5501 
   5502          switch (gregLO3ofRM(modrm)) {
   5503 
   5504             case 0: /* FLD single-real */
   5505                DIP("flds %s\n", dis_buf);
   5506                fp_push();
   5507                put_ST(0, unop(Iop_F32toF64,
   5508                               loadLE(Ity_F32, mkexpr(addr))));
   5509                break;
   5510 
   5511             case 2: /* FST single-real */
   5512                DIP("fsts %s\n", dis_buf);
   5513                storeLE(mkexpr(addr),
   5514                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5515                break;
   5516 
   5517             case 3: /* FSTP single-real */
   5518                DIP("fstps %s\n", dis_buf);
   5519                storeLE(mkexpr(addr),
   5520                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5521                fp_pop();
   5522                break;
   5523 
   5524             case 4: { /* FLDENV m28 */
   5525                /* Uses dirty helper:
    5526                      VexEmNote amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
   5527                IRTemp    ew = newTemp(Ity_I32);
   5528                IRTemp   w64 = newTemp(Ity_I64);
   5529                IRDirty*   d = unsafeIRDirty_0_N (
   5530                                  0/*regparms*/,
   5531                                  "amd64g_dirtyhelper_FLDENV",
   5532                                  &amd64g_dirtyhelper_FLDENV,
   5533                                  mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5534                               );
   5535                d->tmp       = w64;
   5536                /* declare we're reading memory */
   5537                d->mFx   = Ifx_Read;
   5538                d->mAddr = mkexpr(addr);
   5539                d->mSize = 28;
   5540 
   5541                /* declare we're writing guest state */
   5542                d->nFxState = 4;
   5543                vex_bzero(&d->fxState, sizeof(d->fxState));
   5544 
   5545                d->fxState[0].fx     = Ifx_Write;
   5546                d->fxState[0].offset = OFFB_FTOP;
   5547                d->fxState[0].size   = sizeof(UInt);
   5548 
   5549                d->fxState[1].fx     = Ifx_Write;
   5550                d->fxState[1].offset = OFFB_FPTAGS;
   5551                d->fxState[1].size   = 8 * sizeof(UChar);
   5552 
   5553                d->fxState[2].fx     = Ifx_Write;
   5554                d->fxState[2].offset = OFFB_FPROUND;
   5555                d->fxState[2].size   = sizeof(ULong);
   5556 
   5557                d->fxState[3].fx     = Ifx_Write;
   5558                d->fxState[3].offset = OFFB_FC3210;
   5559                d->fxState[3].size   = sizeof(ULong);
   5560 
   5561                stmt( IRStmt_Dirty(d) );
   5562 
   5563                /* ew contains any emulation warning we may need to
   5564                   issue.  If needed, side-exit to the next insn,
   5565                   reporting the warning, so that Valgrind's dispatcher
   5566                   sees the warning. */
   5567                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5568                put_emwarn( mkexpr(ew) );
   5569                stmt(
   5570                   IRStmt_Exit(
   5571                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5572                      Ijk_EmWarn,
   5573                      IRConst_U64( guest_RIP_bbstart+delta ),
   5574                      OFFB_RIP
   5575                   )
   5576                );
   5577 
   5578                DIP("fldenv %s\n", dis_buf);
   5579                break;
   5580             }
   5581 
   5582             case 5: {/* FLDCW */
   5583                /* The only thing we observe in the control word is the
   5584                   rounding mode.  Therefore, pass the 16-bit value
   5585                   (x87 native-format control word) to a clean helper,
   5586                   getting back a 64-bit value, the lower half of which
   5587                   is the FPROUND value to store, and the upper half of
   5588                   which is the emulation-warning token which may be
   5589                   generated.
   5590                */
    5591                /* ULong amd64g_check_fldcw ( ULong ); */
   5592                IRTemp t64 = newTemp(Ity_I64);
   5593                IRTemp ew = newTemp(Ity_I32);
   5594                DIP("fldcw %s\n", dis_buf);
   5595                assign( t64, mkIRExprCCall(
   5596                                Ity_I64, 0/*regparms*/,
   5597                                "amd64g_check_fldcw",
   5598                                &amd64g_check_fldcw,
   5599                                mkIRExprVec_1(
   5600                                   unop( Iop_16Uto64,
   5601                                         loadLE(Ity_I16, mkexpr(addr)))
   5602                                )
   5603                             )
   5604                      );
   5605 
   5606                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5607                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5608                put_emwarn( mkexpr(ew) );
   5609                /* Finally, if an emulation warning was reported,
   5610                   side-exit to the next insn, reporting the warning,
   5611                   so that Valgrind's dispatcher sees the warning. */
   5612                stmt(
   5613                   IRStmt_Exit(
   5614                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5615                      Ijk_EmWarn,
   5616                      IRConst_U64( guest_RIP_bbstart+delta ),
   5617                      OFFB_RIP
   5618                   )
   5619                );
   5620                break;
   5621             }
   5622 
   5623             case 6: { /* FNSTENV m28 */
   5624                /* Uses dirty helper:
    5625                      void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
   5626                IRDirty* d = unsafeIRDirty_0_N (
   5627                                0/*regparms*/,
   5628                                "amd64g_dirtyhelper_FSTENV",
   5629                                &amd64g_dirtyhelper_FSTENV,
   5630                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5631                             );
   5632                /* declare we're writing memory */
   5633                d->mFx   = Ifx_Write;
   5634                d->mAddr = mkexpr(addr);
   5635                d->mSize = 28;
   5636 
   5637                /* declare we're reading guest state */
   5638                d->nFxState = 4;
   5639                vex_bzero(&d->fxState, sizeof(d->fxState));
   5640 
   5641                d->fxState[0].fx     = Ifx_Read;
   5642                d->fxState[0].offset = OFFB_FTOP;
   5643                d->fxState[0].size   = sizeof(UInt);
   5644 
   5645                d->fxState[1].fx     = Ifx_Read;
   5646                d->fxState[1].offset = OFFB_FPTAGS;
   5647                d->fxState[1].size   = 8 * sizeof(UChar);
   5648 
   5649                d->fxState[2].fx     = Ifx_Read;
   5650                d->fxState[2].offset = OFFB_FPROUND;
   5651                d->fxState[2].size   = sizeof(ULong);
   5652 
   5653                d->fxState[3].fx     = Ifx_Read;
   5654                d->fxState[3].offset = OFFB_FC3210;
   5655                d->fxState[3].size   = sizeof(ULong);
   5656 
   5657                stmt( IRStmt_Dirty(d) );
   5658 
   5659                DIP("fnstenv %s\n", dis_buf);
   5660                break;
   5661             }
   5662 
   5663             case 7: /* FNSTCW */
   5664                /* Fake up a native x87 FPU control word.  The only
   5665                   thing it depends on is FPROUND[1:0], so call a clean
   5666                   helper to cook it up. */
   5667                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5668                DIP("fnstcw %s\n", dis_buf);
   5669                storeLE(
   5670                   mkexpr(addr),
   5671                   unop( Iop_64to16,
   5672                         mkIRExprCCall(
   5673                            Ity_I64, 0/*regp*/,
   5674                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5675                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5676                         )
   5677                   )
   5678                );
   5679                break;
   5680 
   5681             default:
   5682                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5683                vex_printf("first_opcode == 0xD9\n");
   5684                goto decode_fail;
   5685          }
   5686 
   5687       } else {
   5688          delta++;
   5689          switch (modrm) {
   5690 
   5691             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5692                r_src = (UInt)modrm - 0xC0;
   5693                DIP("fld %%st(%u)\n", r_src);
   5694                t1 = newTemp(Ity_F64);
   5695                assign(t1, get_ST(r_src));
   5696                fp_push();
   5697                put_ST(0, mkexpr(t1));
   5698                break;
   5699 
   5700             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5701                r_src = (UInt)modrm - 0xC8;
   5702                DIP("fxch %%st(%u)\n", r_src);
   5703                t1 = newTemp(Ity_F64);
   5704                t2 = newTemp(Ity_F64);
   5705                assign(t1, get_ST(0));
   5706                assign(t2, get_ST(r_src));
   5707                put_ST_UNCHECKED(0, mkexpr(t2));
   5708                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5709                break;
   5710 
   5711             case 0xE0: /* FCHS */
   5712                DIP("fchs\n");
   5713                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5714                break;
   5715 
   5716             case 0xE1: /* FABS */
   5717                DIP("fabs\n");
   5718                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5719                break;
   5720 
   5721             case 0xE5: { /* FXAM */
   5722                /* This is an interesting one.  It examines %st(0),
   5723                   regardless of whether the tag says it's empty or not.
   5724                   Here, just pass both the tag (in our format) and the
   5725                   value (as a double, actually a ULong) to a helper
   5726                   function. */
   5727                IRExpr** args
   5728                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5729                                    unop(Iop_ReinterpF64asI64,
   5730                                         get_ST_UNCHECKED(0)) );
   5731                put_C3210(mkIRExprCCall(
   5732                             Ity_I64,
   5733                             0/*regparm*/,
   5734                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5735                             args
   5736                         ));
   5737                DIP("fxam\n");
   5738                break;
   5739             }
   5740 
   5741             case 0xE8: /* FLD1 */
   5742                DIP("fld1\n");
   5743                fp_push();
   5744                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5745                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5746                break;
   5747 
   5748             case 0xE9: /* FLDL2T */
   5749                DIP("fldl2t\n");
   5750                fp_push();
   5751                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5752                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5753                break;
   5754 
   5755             case 0xEA: /* FLDL2E */
   5756                DIP("fldl2e\n");
   5757                fp_push();
   5758                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5759                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5760                break;
   5761 
   5762             case 0xEB: /* FLDPI */
   5763                DIP("fldpi\n");
   5764                fp_push();
   5765                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5766                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5767                break;
   5768 
   5769             case 0xEC: /* FLDLG2 */
   5770                DIP("fldlg2\n");
   5771                fp_push();
   5772                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5773                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5774                break;
   5775 
   5776             case 0xED: /* FLDLN2 */
   5777                DIP("fldln2\n");
   5778                fp_push();
   5779                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5780                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5781                break;
   5782 
   5783             case 0xEE: /* FLDZ */
   5784                DIP("fldz\n");
   5785                fp_push();
   5786                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5787                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5788                break;
   5789 
   5790             case 0xF0: /* F2XM1 */
   5791                DIP("f2xm1\n");
   5792                put_ST_UNCHECKED(0,
   5793                   binop(Iop_2xm1F64,
   5794                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5795                         get_ST(0)));
   5796                break;
   5797 
   5798             case 0xF1: /* FYL2X */
   5799                DIP("fyl2x\n");
   5800                put_ST_UNCHECKED(1,
   5801                   triop(Iop_Yl2xF64,
   5802                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5803                         get_ST(1),
   5804                         get_ST(0)));
   5805                fp_pop();
   5806                break;
   5807 
   5808             case 0xF2: { /* FPTAN */
   5809                DIP("fptan\n");
   5810                IRTemp argD = newTemp(Ity_F64);
   5811                assign(argD, get_ST(0));
   5812                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5813                IRTemp resD = newTemp(Ity_F64);
   5814                assign(resD,
   5815                   IRExpr_ITE(
   5816                      mkexpr(argOK),
   5817                      binop(Iop_TanF64,
   5818                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5819                            mkexpr(argD)),
   5820                      mkexpr(argD))
   5821                );
   5822                put_ST_UNCHECKED(0, mkexpr(resD));
   5823                /* Conditionally push 1.0 on the stack, if the arg is
   5824                   in range */
   5825                maybe_fp_push(argOK);
   5826                maybe_put_ST(argOK, 0,
   5827                             IRExpr_Const(IRConst_F64(1.0)));
   5828                set_C2( binop(Iop_Xor64,
   5829                              unop(Iop_1Uto64, mkexpr(argOK)),
   5830                              mkU64(1)) );
   5831                break;
   5832             }
   5833 
   5834             case 0xF3: /* FPATAN */
   5835                DIP("fpatan\n");
   5836                put_ST_UNCHECKED(1,
   5837                   triop(Iop_AtanF64,
   5838                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5839                         get_ST(1),
   5840                         get_ST(0)));
   5841                fp_pop();
   5842                break;
   5843 
   5844             case 0xF4: { /* FXTRACT */
   5845                IRTemp argF = newTemp(Ity_F64);
   5846                IRTemp sigF = newTemp(Ity_F64);
   5847                IRTemp expF = newTemp(Ity_F64);
   5848                IRTemp argI = newTemp(Ity_I64);
   5849                IRTemp sigI = newTemp(Ity_I64);
   5850                IRTemp expI = newTemp(Ity_I64);
   5851                DIP("fxtract\n");
   5852                assign( argF, get_ST(0) );
   5853                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5854                assign( sigI,
   5855                        mkIRExprCCall(
   5856                           Ity_I64, 0/*regparms*/,
   5857                           "x86amd64g_calculate_FXTRACT",
   5858                           &x86amd64g_calculate_FXTRACT,
   5859                           mkIRExprVec_2( mkexpr(argI),
   5860                                          mkIRExpr_HWord(0)/*sig*/ ))
   5861                );
   5862                assign( expI,
   5863                        mkIRExprCCall(
   5864                           Ity_I64, 0/*regparms*/,
   5865                           "x86amd64g_calculate_FXTRACT",
   5866                           &x86amd64g_calculate_FXTRACT,
   5867                           mkIRExprVec_2( mkexpr(argI),
   5868                                          mkIRExpr_HWord(1)/*exp*/ ))
   5869                );
   5870                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5871                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5872                /* exponent */
   5873                put_ST_UNCHECKED(0, mkexpr(expF) );
   5874                fp_push();
   5875                /* significand */
   5876                put_ST(0, mkexpr(sigF) );
   5877                break;
   5878             }
   5879 
   5880             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5881                IRTemp a1 = newTemp(Ity_F64);
   5882                IRTemp a2 = newTemp(Ity_F64);
   5883                DIP("fprem1\n");
   5884                /* Do FPREM1 twice, once to get the remainder, and once
   5885                   to get the C3210 flag values. */
   5886                assign( a1, get_ST(0) );
   5887                assign( a2, get_ST(1) );
   5888                put_ST_UNCHECKED(0,
   5889                   triop(Iop_PRem1F64,
   5890                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5891                         mkexpr(a1),
   5892                         mkexpr(a2)));
   5893                put_C3210(
   5894                   unop(Iop_32Uto64,
   5895                   triop(Iop_PRem1C3210F64,
   5896                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5897                         mkexpr(a1),
   5898                         mkexpr(a2)) ));
   5899                break;
   5900             }
   5901 
   5902             case 0xF7: /* FINCSTP */
   5903                DIP("fincstp\n");
   5904                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5905                break;
   5906 
   5907             case 0xF8: { /* FPREM -- not IEEE compliant */
   5908                IRTemp a1 = newTemp(Ity_F64);
   5909                IRTemp a2 = newTemp(Ity_F64);
   5910                DIP("fprem\n");
   5911                /* Do FPREM twice, once to get the remainder, and once
   5912                   to get the C3210 flag values. */
   5913                assign( a1, get_ST(0) );
   5914                assign( a2, get_ST(1) );
   5915                put_ST_UNCHECKED(0,
   5916                   triop(Iop_PRemF64,
   5917                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5918                         mkexpr(a1),
   5919                         mkexpr(a2)));
   5920                put_C3210(
   5921                   unop(Iop_32Uto64,
   5922                   triop(Iop_PRemC3210F64,
   5923                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5924                         mkexpr(a1),
   5925                         mkexpr(a2)) ));
   5926                break;
   5927             }
   5928 
   5929             case 0xF9: /* FYL2XP1 */
   5930                DIP("fyl2xp1\n");
   5931                put_ST_UNCHECKED(1,
   5932                   triop(Iop_Yl2xp1F64,
   5933                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5934                         get_ST(1),
   5935                         get_ST(0)));
   5936                fp_pop();
   5937                break;
   5938 
   5939             case 0xFA: /* FSQRT */
   5940                DIP("fsqrt\n");
   5941                put_ST_UNCHECKED(0,
   5942                   binop(Iop_SqrtF64,
   5943                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5944                         get_ST(0)));
   5945                break;
   5946 
   5947             case 0xFB: { /* FSINCOS */
   5948                DIP("fsincos\n");
   5949                IRTemp argD = newTemp(Ity_F64);
   5950                assign(argD, get_ST(0));
   5951                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5952                IRTemp resD = newTemp(Ity_F64);
   5953                assign(resD,
   5954                   IRExpr_ITE(
   5955                      mkexpr(argOK),
   5956                      binop(Iop_SinF64,
   5957                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5958                            mkexpr(argD)),
   5959                      mkexpr(argD))
   5960                );
   5961                put_ST_UNCHECKED(0, mkexpr(resD));
   5962                /* Conditionally push the cos value on the stack, if
   5963                   the arg is in range */
   5964                maybe_fp_push(argOK);
   5965                maybe_put_ST(argOK, 0,
   5966                   binop(Iop_CosF64,
   5967                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5968                         mkexpr(argD)));
   5969                set_C2( binop(Iop_Xor64,
   5970                              unop(Iop_1Uto64, mkexpr(argOK)),
   5971                              mkU64(1)) );
   5972                break;
   5973             }
   5974 
   5975             case 0xFC: /* FRNDINT */
   5976                DIP("frndint\n");
   5977                put_ST_UNCHECKED(0,
   5978                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   5979                break;
   5980 
   5981             case 0xFD: /* FSCALE */
   5982                DIP("fscale\n");
   5983                put_ST_UNCHECKED(0,
   5984                   triop(Iop_ScaleF64,
   5985                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5986                         get_ST(0),
   5987                         get_ST(1)));
   5988                break;
   5989 
   5990             case 0xFE:   /* FSIN */
   5991             case 0xFF: { /* FCOS */
   5992                Bool isSIN = modrm == 0xFE;
   5993                DIP("%s\n", isSIN ? "fsin" : "fcos");
   5994                IRTemp argD = newTemp(Ity_F64);
   5995                assign(argD, get_ST(0));
   5996                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5997                IRTemp resD = newTemp(Ity_F64);
   5998                assign(resD,
   5999                   IRExpr_ITE(
   6000                      mkexpr(argOK),
   6001                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6002                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6003                            mkexpr(argD)),
   6004                      mkexpr(argD))
   6005                );
   6006                put_ST_UNCHECKED(0, mkexpr(resD));
   6007                set_C2( binop(Iop_Xor64,
   6008                              unop(Iop_1Uto64, mkexpr(argOK)),
   6009                              mkU64(1)) );
   6010                break;
   6011             }
   6012 
   6013             default:
   6014                goto decode_fail;
   6015          }
   6016       }
   6017    }
   6018 
   6019    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6020    else
   6021    if (first_opcode == 0xDA) {
   6022 
   6023       if (modrm < 0xC0) {
   6024 
   6025          /* bits 5,4,3 are an opcode extension, and the modRM also
   6026             specifies an address. */
   6027          IROp   fop;
   6028          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6029          delta += len;
   6030          switch (gregLO3ofRM(modrm)) {
   6031 
   6032             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6033                DIP("fiaddl %s\n", dis_buf);
   6034                fop = Iop_AddF64;
   6035                goto do_fop_m32;
   6036 
   6037             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6038                DIP("fimull %s\n", dis_buf);
   6039                fop = Iop_MulF64;
   6040                goto do_fop_m32;
   6041 
   6042             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6043                DIP("fisubl %s\n", dis_buf);
   6044                fop = Iop_SubF64;
   6045                goto do_fop_m32;
   6046 
   6047             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6048                DIP("fisubrl %s\n", dis_buf);
   6049                fop = Iop_SubF64;
   6050                goto do_foprev_m32;
   6051 
   6052             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6053                DIP("fisubl %s\n", dis_buf);
   6054                fop = Iop_DivF64;
   6055                goto do_fop_m32;
   6056 
   6057             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6058                DIP("fidivrl %s\n", dis_buf);
   6059                fop = Iop_DivF64;
   6060                goto do_foprev_m32;
   6061 
   6062             do_fop_m32:
   6063                put_ST_UNCHECKED(0,
   6064                   triop(fop,
   6065                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6066                         get_ST(0),
   6067                         unop(Iop_I32StoF64,
   6068                              loadLE(Ity_I32, mkexpr(addr)))));
   6069                break;
   6070 
   6071             do_foprev_m32:
   6072                put_ST_UNCHECKED(0,
   6073                   triop(fop,
   6074                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6075                         unop(Iop_I32StoF64,
   6076                              loadLE(Ity_I32, mkexpr(addr))),
   6077                         get_ST(0)));
   6078                break;
   6079 
   6080             default:
   6081                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6082                vex_printf("first_opcode == 0xDA\n");
   6083                goto decode_fail;
   6084          }
   6085 
   6086       } else {
   6087 
   6088          delta++;
   6089          switch (modrm) {
   6090 
   6091             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6092                r_src = (UInt)modrm - 0xC0;
   6093                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6094                put_ST_UNCHECKED(0,
   6095                                 IRExpr_ITE(
   6096                                     mk_amd64g_calculate_condition(AMD64CondB),
   6097                                     get_ST(r_src), get_ST(0)) );
   6098                break;
   6099 
   6100             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6101                r_src = (UInt)modrm - 0xC8;
   6102                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6103                put_ST_UNCHECKED(0,
   6104                                 IRExpr_ITE(
   6105                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6106                                     get_ST(r_src), get_ST(0)) );
   6107                break;
   6108 
   6109             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6110                r_src = (UInt)modrm - 0xD0;
   6111                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6112                put_ST_UNCHECKED(0,
   6113                                 IRExpr_ITE(
   6114                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6115                                     get_ST(r_src), get_ST(0)) );
   6116                break;
   6117 
   6118             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6119                r_src = (UInt)modrm - 0xD8;
   6120                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6121                put_ST_UNCHECKED(0,
   6122                                 IRExpr_ITE(
   6123                                     mk_amd64g_calculate_condition(AMD64CondP),
   6124                                     get_ST(r_src), get_ST(0)) );
   6125                break;
   6126 
   6127             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6128                DIP("fucompp %%st(0),%%st(1)\n");
   6129                /* This forces C1 to zero, which isn't right. */
   6130                put_C3210(
   6131                    unop(Iop_32Uto64,
   6132                    binop( Iop_And32,
   6133                           binop(Iop_Shl32,
   6134                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6135                                 mkU8(8)),
   6136                           mkU32(0x4500)
   6137                    )));
   6138                fp_pop();
   6139                fp_pop();
   6140                break;
   6141 
   6142             default:
   6143                goto decode_fail;
   6144          }
   6145 
   6146       }
   6147    }
   6148 
   6149    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6150    else
   6151    if (first_opcode == 0xDB) {
   6152       if (modrm < 0xC0) {
   6153 
   6154          /* bits 5,4,3 are an opcode extension, and the modRM also
   6155             specifies an address. */
   6156          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6157          delta += len;
   6158 
   6159          switch (gregLO3ofRM(modrm)) {
   6160 
   6161             case 0: /* FILD m32int */
   6162                DIP("fildl %s\n", dis_buf);
   6163                fp_push();
   6164                put_ST(0, unop(Iop_I32StoF64,
   6165                               loadLE(Ity_I32, mkexpr(addr))));
   6166                break;
   6167 
   6168             case 1: /* FISTTPL m32 (SSE3) */
   6169                DIP("fisttpl %s\n", dis_buf);
   6170                storeLE( mkexpr(addr),
   6171                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6172                fp_pop();
   6173                break;
   6174 
   6175             case 2: /* FIST m32 */
   6176                DIP("fistl %s\n", dis_buf);
   6177                storeLE( mkexpr(addr),
   6178                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6179                break;
   6180 
   6181             case 3: /* FISTP m32 */
   6182                DIP("fistpl %s\n", dis_buf);
   6183                storeLE( mkexpr(addr),
   6184                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6185                fp_pop();
   6186                break;
   6187 
            case 5: { /* FLD extended-real */
               /* Load an 80-bit extended-real from memory and push it
                  onto the x87 stack.  Uses dirty helper:
                     ULong amd64g_dirtyhelper_loadF80le ( ULong addr )
                  addr holds the address.  First, do a dirty call to
                  get hold of the data; the helper narrows the 80-bit
                  value to a 64-bit F64 bit pattern. */
               IRTemp   val  = newTemp(Ity_I64);
               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );

               IRDirty* d = unsafeIRDirty_1_N (
                               val,
                               0/*regparms*/,
                               "amd64g_dirtyhelper_loadF80le",
                               &amd64g_dirtyhelper_loadF80le,
                               args
                            );
               /* declare that we're reading memory (10 bytes: the full
                  80-bit extended-real) */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 10;

               /* execute the dirty call, dumping the result in val. */
               stmt( IRStmt_Dirty(d) );
               fp_push();
               /* val carries raw F64 bits; reinterpret (not convert) and
                  write to the new top-of-stack slot. */
               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));

               DIP("fldt %s\n", dis_buf);
               break;
            }
   6216 
   6217             case 7: { /* FSTP extended-real */
   6218                /* Uses dirty helper:
   6219                      void amd64g_storeF80le ( ULong addr, ULong data )
   6220                */
   6221                IRExpr** args
   6222                   = mkIRExprVec_2( mkexpr(addr),
   6223                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6224 
   6225                IRDirty* d = unsafeIRDirty_0_N (
   6226                                0/*regparms*/,
   6227                                "amd64g_dirtyhelper_storeF80le",
   6228                                &amd64g_dirtyhelper_storeF80le,
   6229                                args
   6230                             );
   6231                /* declare we're writing memory */
   6232                d->mFx   = Ifx_Write;
   6233                d->mAddr = mkexpr(addr);
   6234                d->mSize = 10;
   6235 
   6236                /* execute the dirty call. */
   6237                stmt( IRStmt_Dirty(d) );
   6238                fp_pop();
   6239 
   6240                DIP("fstpt\n %s", dis_buf);
   6241                break;
   6242             }
   6243 
   6244             default:
   6245                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6246                vex_printf("first_opcode == 0xDB\n");
   6247                goto decode_fail;
   6248          }
   6249 
   6250       } else {
   6251 
   6252          delta++;
   6253          switch (modrm) {
   6254 
   6255             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6256                r_src = (UInt)modrm - 0xC0;
   6257                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6258                put_ST_UNCHECKED(0,
   6259                                 IRExpr_ITE(
   6260                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6261                                     get_ST(r_src), get_ST(0)) );
   6262                break;
   6263 
   6264             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6265                r_src = (UInt)modrm - 0xC8;
   6266                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6267                put_ST_UNCHECKED(
   6268                   0,
   6269                   IRExpr_ITE(
   6270                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6271                      get_ST(r_src),
   6272                      get_ST(0)
   6273                   )
   6274                );
   6275                break;
   6276 
   6277             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6278                r_src = (UInt)modrm - 0xD0;
   6279                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6280                put_ST_UNCHECKED(
   6281                   0,
   6282                   IRExpr_ITE(
   6283                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6284                      get_ST(r_src),
   6285                      get_ST(0)
   6286                   )
   6287                );
   6288                break;
   6289 
   6290             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6291                r_src = (UInt)modrm - 0xD8;
   6292                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6293                put_ST_UNCHECKED(
   6294                   0,
   6295                   IRExpr_ITE(
   6296                      mk_amd64g_calculate_condition(AMD64CondNP),
   6297                      get_ST(r_src),
   6298                      get_ST(0)
   6299                   )
   6300                );
   6301                break;
   6302 
   6303             case 0xE2:
   6304                DIP("fnclex\n");
   6305                break;
   6306 
            case 0xE3: {
               /* FNINIT: reset the FPU to its default state.
                  Uses dirty helper:
                     void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
               IRDirty* d  = unsafeIRDirty_0_N (
                                0/*regparms*/,
                                "amd64g_dirtyhelper_FINIT",
                                &amd64g_dirtyhelper_FINIT,
                                mkIRExprVec_1( IRExpr_BBPTR() )
                             );

               /* declare we're writing guest state: the five FP-related
                  guest fields the helper reinitialises */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               /* stack-top pointer */
               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               /* the eight 64-bit FP registers */
               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               /* one tag byte per register */
               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               /* rounding mode */
               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(ULong);

               /* C3..C0 condition-code bits */
               d->fxState[4].fx     = Ifx_Write;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               DIP("fninit\n");
               break;
            }
   6346 
   6347             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6348                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6349                break;
   6350 
   6351             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6352                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6353                break;
   6354 
   6355             default:
   6356                goto decode_fail;
   6357          }
   6358       }
   6359    }
   6360 
   6361    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   6362    else
   6363    if (first_opcode == 0xDC) {
   6364       if (modrm < 0xC0) {
   6365 
   6366          /* bits 5,4,3 are an opcode extension, and the modRM also
   6367             specifies an address. */
   6368          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6369          delta += len;
   6370 
   6371          switch (gregLO3ofRM(modrm)) {
   6372 
   6373             case 0: /* FADD double-real */
   6374                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   6375                break;
   6376 
   6377             case 1: /* FMUL double-real */
   6378                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   6379                break;
   6380 
   6381 //..             case 2: /* FCOM double-real */
   6382 //..                DIP("fcoml %s\n", dis_buf);
   6383 //..                /* This forces C1 to zero, which isn't right. */
   6384 //..                put_C3210(
   6385 //..                    binop( Iop_And32,
   6386 //..                           binop(Iop_Shl32,
   6387 //..                                 binop(Iop_CmpF64,
   6388 //..                                       get_ST(0),
   6389 //..                                       loadLE(Ity_F64,mkexpr(addr))),
   6390 //..                                 mkU8(8)),
   6391 //..                           mkU32(0x4500)
   6392 //..                    ));
   6393 //..                break;
   6394 
   6395             case 3: /* FCOMP double-real */
   6396                DIP("fcompl %s\n", dis_buf);
   6397                /* This forces C1 to zero, which isn't right. */
   6398                put_C3210(
   6399                    unop(Iop_32Uto64,
   6400                    binop( Iop_And32,
   6401                           binop(Iop_Shl32,
   6402                                 binop(Iop_CmpF64,
   6403                                       get_ST(0),
   6404                                       loadLE(Ity_F64,mkexpr(addr))),
   6405                                 mkU8(8)),
   6406                           mkU32(0x4500)
   6407                    )));
   6408                fp_pop();
   6409                break;
   6410 
   6411             case 4: /* FSUB double-real */
   6412                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   6413                break;
   6414 
   6415             case 5: /* FSUBR double-real */
   6416                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   6417                break;
   6418 
   6419             case 6: /* FDIV double-real */
   6420                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   6421                break;
   6422 
   6423             case 7: /* FDIVR double-real */
   6424                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   6425                break;
   6426 
   6427             default:
   6428                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6429                vex_printf("first_opcode == 0xDC\n");
   6430                goto decode_fail;
   6431          }
   6432 
   6433       } else {
   6434 
   6435          delta++;
   6436          switch (modrm) {
   6437 
   6438             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   6439                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   6440                break;
   6441 
   6442             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   6443                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   6444                break;
   6445 
   6446             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   6447                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   6448                break;
   6449 
   6450             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   6451                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   6452                break;
   6453 
   6454             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   6455                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   6456                break;
   6457 
   6458             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   6459                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   6460                break;
   6461 
   6462             default:
   6463                goto decode_fail;
   6464          }
   6465 
   6466       }
   6467    }
   6468 
   6469    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6470    else
   6471    if (first_opcode == 0xDD) {
   6472 
   6473       if (modrm < 0xC0) {
   6474 
   6475          /* bits 5,4,3 are an opcode extension, and the modRM also
   6476             specifies an address. */
   6477          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6478          delta += len;
   6479 
   6480          switch (gregLO3ofRM(modrm)) {
   6481 
   6482             case 0: /* FLD double-real */
   6483                DIP("fldl %s\n", dis_buf);
   6484                fp_push();
   6485                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6486                break;
   6487 
   6488             case 1: /* FISTTPQ m64 (SSE3) */
   6489                DIP("fistppll %s\n", dis_buf);
   6490                storeLE( mkexpr(addr),
   6491                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6492                fp_pop();
   6493                break;
   6494 
   6495             case 2: /* FST double-real */
   6496                DIP("fstl %s\n", dis_buf);
   6497                storeLE(mkexpr(addr), get_ST(0));
   6498                break;
   6499 
   6500             case 3: /* FSTP double-real */
   6501                DIP("fstpl %s\n", dis_buf);
   6502                storeLE(mkexpr(addr), get_ST(0));
   6503                fp_pop();
   6504                break;
   6505 
   6506             case 4: { /* FRSTOR m94/m108 */
   6507                IRTemp   ew = newTemp(Ity_I32);
   6508                IRTemp  w64 = newTemp(Ity_I64);
   6509                IRDirty*  d;
   6510                if ( have66(pfx) ) {
   6511                   /* Uses dirty helper:
   6512                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6513                                   ( VexGuestAMD64State*, HWord ) */
   6514                   d = unsafeIRDirty_0_N (
   6515                          0/*regparms*/,
   6516                          "amd64g_dirtyhelper_FRSTORS",
   6517                          &amd64g_dirtyhelper_FRSTORS,
   6518                          mkIRExprVec_1( mkexpr(addr) )
   6519                       );
   6520                   d->mSize = 94;
   6521                } else {
   6522                   /* Uses dirty helper:
   6523                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6524                                   ( VexGuestAMD64State*, HWord ) */
   6525                   d = unsafeIRDirty_0_N (
   6526                          0/*regparms*/,
   6527                          "amd64g_dirtyhelper_FRSTOR",
   6528                          &amd64g_dirtyhelper_FRSTOR,
   6529                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6530                       );
   6531                   d->mSize = 108;
   6532                }
   6533 
   6534                d->tmp    = w64;
   6535                /* declare we're reading memory */
   6536                d->mFx   = Ifx_Read;
   6537                d->mAddr = mkexpr(addr);
   6538                /* d->mSize set above */
   6539 
   6540                /* declare we're writing guest state */
   6541                d->nFxState = 5;
   6542                vex_bzero(&d->fxState, sizeof(d->fxState));
   6543 
   6544                d->fxState[0].fx     = Ifx_Write;
   6545                d->fxState[0].offset = OFFB_FTOP;
   6546                d->fxState[0].size   = sizeof(UInt);
   6547 
   6548                d->fxState[1].fx     = Ifx_Write;
   6549                d->fxState[1].offset = OFFB_FPREGS;
   6550                d->fxState[1].size   = 8 * sizeof(ULong);
   6551 
   6552                d->fxState[2].fx     = Ifx_Write;
   6553                d->fxState[2].offset = OFFB_FPTAGS;
   6554                d->fxState[2].size   = 8 * sizeof(UChar);
   6555 
   6556                d->fxState[3].fx     = Ifx_Write;
   6557                d->fxState[3].offset = OFFB_FPROUND;
   6558                d->fxState[3].size   = sizeof(ULong);
   6559 
   6560                d->fxState[4].fx     = Ifx_Write;
   6561                d->fxState[4].offset = OFFB_FC3210;
   6562                d->fxState[4].size   = sizeof(ULong);
   6563 
   6564                stmt( IRStmt_Dirty(d) );
   6565 
   6566                /* ew contains any emulation warning we may need to
   6567                   issue.  If needed, side-exit to the next insn,
   6568                   reporting the warning, so that Valgrind's dispatcher
   6569                   sees the warning. */
   6570                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6571                put_emwarn( mkexpr(ew) );
   6572                stmt(
   6573                   IRStmt_Exit(
   6574                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6575                      Ijk_EmWarn,
   6576                      IRConst_U64( guest_RIP_bbstart+delta ),
   6577                      OFFB_RIP
   6578                   )
   6579                );
   6580 
   6581                if ( have66(pfx) ) {
   6582                   DIP("frstors %s\n", dis_buf);
   6583                } else {
   6584                   DIP("frstor %s\n", dis_buf);
   6585                }
   6586                break;
   6587             }
   6588 
   6589             case 6: { /* FNSAVE m94/m108 */
   6590                IRDirty *d;
   6591                if ( have66(pfx) ) {
   6592                  /* Uses dirty helper:
   6593                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6594                                                       HWord ) */
   6595                   d = unsafeIRDirty_0_N (
   6596                          0/*regparms*/,
   6597                          "amd64g_dirtyhelper_FNSAVES",
   6598                          &amd64g_dirtyhelper_FNSAVES,
   6599                          mkIRExprVec_1( mkexpr(addr) )
   6600                          );
   6601                   d->mSize = 94;
   6602                } else {
   6603                  /* Uses dirty helper:
   6604                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6605                                                      HWord ) */
   6606                   d = unsafeIRDirty_0_N (
   6607                          0/*regparms*/,
   6608                          "amd64g_dirtyhelper_FNSAVE",
   6609                          &amd64g_dirtyhelper_FNSAVE,
   6610                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6611                       );
   6612                   d->mSize = 108;
   6613                }
   6614 
   6615                /* declare we're writing memory */
   6616                d->mFx   = Ifx_Write;
   6617                d->mAddr = mkexpr(addr);
   6618                /* d->mSize set above */
   6619 
   6620                /* declare we're reading guest state */
   6621                d->nFxState = 5;
   6622                vex_bzero(&d->fxState, sizeof(d->fxState));
   6623 
   6624                d->fxState[0].fx     = Ifx_Read;
   6625                d->fxState[0].offset = OFFB_FTOP;
   6626                d->fxState[0].size   = sizeof(UInt);
   6627 
   6628                d->fxState[1].fx     = Ifx_Read;
   6629                d->fxState[1].offset = OFFB_FPREGS;
   6630                d->fxState[1].size   = 8 * sizeof(ULong);
   6631 
   6632                d->fxState[2].fx     = Ifx_Read;
   6633                d->fxState[2].offset = OFFB_FPTAGS;
   6634                d->fxState[2].size   = 8 * sizeof(UChar);
   6635 
   6636                d->fxState[3].fx     = Ifx_Read;
   6637                d->fxState[3].offset = OFFB_FPROUND;
   6638                d->fxState[3].size   = sizeof(ULong);
   6639 
   6640                d->fxState[4].fx     = Ifx_Read;
   6641                d->fxState[4].offset = OFFB_FC3210;
   6642                d->fxState[4].size   = sizeof(ULong);
   6643 
   6644                stmt( IRStmt_Dirty(d) );
   6645 
   6646                if ( have66(pfx) ) {
   6647                  DIP("fnsaves %s\n", dis_buf);
   6648                } else {
   6649                  DIP("fnsave %s\n", dis_buf);
   6650                }
   6651                break;
   6652             }
   6653 
            case 7: { /* FNSTSW m16 */
               /* Store the synthesised 16-bit FPU status word to memory. */
               IRExpr* sw = get_FPU_sw();
               /* get_FPU_sw() must hand back exactly an I16. */
               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
               storeLE( mkexpr(addr), sw );
               DIP("fnstsw %s\n", dis_buf);
               break;
            }
   6661 
   6662             default:
   6663                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6664                vex_printf("first_opcode == 0xDD\n");
   6665                goto decode_fail;
   6666          }
   6667       } else {
   6668          delta++;
   6669          switch (modrm) {
   6670 
   6671             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6672                r_dst = (UInt)modrm - 0xC0;
   6673                DIP("ffree %%st(%u)\n", r_dst);
   6674                put_ST_TAG ( r_dst, mkU8(0) );
   6675                break;
   6676 
   6677             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6678                r_dst = (UInt)modrm - 0xD0;
   6679                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6680                /* P4 manual says: "If the destination operand is a
   6681                   non-empty register, the invalid-operation exception
   6682                   is not generated.  Hence put_ST_UNCHECKED. */
   6683                put_ST_UNCHECKED(r_dst, get_ST(0));
   6684                break;
   6685 
   6686             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6687                r_dst = (UInt)modrm - 0xD8;
   6688                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6689                /* P4 manual says: "If the destination operand is a
   6690                   non-empty register, the invalid-operation exception
   6691                   is not generated.  Hence put_ST_UNCHECKED. */
   6692                put_ST_UNCHECKED(r_dst, get_ST(0));
   6693                fp_pop();
   6694                break;
   6695 
   6696             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6697                r_dst = (UInt)modrm - 0xE0;
   6698                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6699                /* This forces C1 to zero, which isn't right. */
   6700                put_C3210(
   6701                    unop(Iop_32Uto64,
   6702                    binop( Iop_And32,
   6703                           binop(Iop_Shl32,
   6704                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6705                                 mkU8(8)),
   6706                           mkU32(0x4500)
   6707                    )));
   6708                break;
   6709 
   6710             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6711                r_dst = (UInt)modrm - 0xE8;
   6712                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6713                /* This forces C1 to zero, which isn't right. */
   6714                put_C3210(
   6715                    unop(Iop_32Uto64,
   6716                    binop( Iop_And32,
   6717                           binop(Iop_Shl32,
   6718                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6719                                 mkU8(8)),
   6720                           mkU32(0x4500)
   6721                    )));
   6722                fp_pop();
   6723                break;
   6724 
   6725             default:
   6726                goto decode_fail;
   6727          }
   6728       }
   6729    }
   6730 
   6731    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6732    else
   6733    if (first_opcode == 0xDE) {
   6734 
   6735       if (modrm < 0xC0) {
   6736 
   6737          /* bits 5,4,3 are an opcode extension, and the modRM also
   6738             specifies an address. */
   6739          IROp   fop;
   6740          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6741          delta += len;
   6742 
   6743          switch (gregLO3ofRM(modrm)) {
   6744 
   6745             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6746                DIP("fiaddw %s\n", dis_buf);
   6747                fop = Iop_AddF64;
   6748                goto do_fop_m16;
   6749 
   6750             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6751                DIP("fimulw %s\n", dis_buf);
   6752                fop = Iop_MulF64;
   6753                goto do_fop_m16;
   6754 
   6755             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6756                DIP("fisubw %s\n", dis_buf);
   6757                fop = Iop_SubF64;
   6758                goto do_fop_m16;
   6759 
   6760             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6761                DIP("fisubrw %s\n", dis_buf);
   6762                fop = Iop_SubF64;
   6763                goto do_foprev_m16;
   6764 
   6765             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6766                DIP("fisubw %s\n", dis_buf);
   6767                fop = Iop_DivF64;
   6768                goto do_fop_m16;
   6769 
   6770             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6771                DIP("fidivrw %s\n", dis_buf);
   6772                fop = Iop_DivF64;
   6773                goto do_foprev_m16;
   6774 
   6775             do_fop_m16:
   6776                put_ST_UNCHECKED(0,
   6777                   triop(fop,
   6778                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6779                         get_ST(0),
   6780                         unop(Iop_I32StoF64,
   6781                              unop(Iop_16Sto32,
   6782                                   loadLE(Ity_I16, mkexpr(addr))))));
   6783                break;
   6784 
   6785             do_foprev_m16:
   6786                put_ST_UNCHECKED(0,
   6787                   triop(fop,
   6788                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6789                         unop(Iop_I32StoF64,
   6790                              unop(Iop_16Sto32,
   6791                                   loadLE(Ity_I16, mkexpr(addr)))),
   6792                         get_ST(0)));
   6793                break;
   6794 
   6795             default:
   6796                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6797                vex_printf("first_opcode == 0xDE\n");
   6798                goto decode_fail;
   6799          }
   6800 
   6801       } else {
   6802 
   6803          delta++;
   6804          switch (modrm) {
   6805 
   6806             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6807                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6808                break;
   6809 
   6810             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6811                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6812                break;
   6813 
   6814             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6815                DIP("fcompp %%st(0),%%st(1)\n");
   6816                /* This forces C1 to zero, which isn't right. */
   6817                put_C3210(
   6818                    unop(Iop_32Uto64,
   6819                    binop( Iop_And32,
   6820                           binop(Iop_Shl32,
   6821                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6822                                 mkU8(8)),
   6823                           mkU32(0x4500)
   6824                    )));
   6825                fp_pop();
   6826                fp_pop();
   6827                break;
   6828 
   6829             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6830                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6831                break;
   6832 
   6833             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6834                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6835                break;
   6836 
   6837             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6838                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6839                break;
   6840 
   6841             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6842                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6843                break;
   6844 
   6845             default:
   6846                goto decode_fail;
   6847          }
   6848 
   6849       }
   6850    }
   6851 
   6852    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6853    else
   6854    if (first_opcode == 0xDF) {
   6855 
   6856       if (modrm < 0xC0) {
   6857 
   6858          /* bits 5,4,3 are an opcode extension, and the modRM also
   6859             specifies an address. */
   6860          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6861          delta += len;
   6862 
   6863          switch (gregLO3ofRM(modrm)) {
   6864 
   6865             case 0: /* FILD m16int */
   6866                DIP("fildw %s\n", dis_buf);
   6867                fp_push();
   6868                put_ST(0, unop(Iop_I32StoF64,
   6869                               unop(Iop_16Sto32,
   6870                                    loadLE(Ity_I16, mkexpr(addr)))));
   6871                break;
   6872 
   6873             case 1: /* FISTTPS m16 (SSE3) */
   6874                DIP("fisttps %s\n", dis_buf);
   6875                storeLE( mkexpr(addr),
   6876                         x87ishly_qnarrow_32_to_16(
   6877                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6878                fp_pop();
   6879                break;
   6880 
   6881             case 2: /* FIST m16 */
   6882                DIP("fists %s\n", dis_buf);
   6883                storeLE( mkexpr(addr),
   6884                         x87ishly_qnarrow_32_to_16(
   6885                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6886                break;
   6887 
   6888             case 3: /* FISTP m16 */
   6889                DIP("fistps %s\n", dis_buf);
   6890                storeLE( mkexpr(addr),
   6891                         x87ishly_qnarrow_32_to_16(
   6892                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6893                fp_pop();
   6894                break;
   6895 
   6896             case 5: /* FILD m64 */
   6897                DIP("fildll %s\n", dis_buf);
   6898                fp_push();
   6899                put_ST(0, binop(Iop_I64StoF64,
   6900                                get_roundingmode(),
   6901                                loadLE(Ity_I64, mkexpr(addr))));
   6902                break;
   6903 
   6904             case 7: /* FISTP m64 */
   6905                DIP("fistpll %s\n", dis_buf);
   6906                storeLE( mkexpr(addr),
   6907                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6908                fp_pop();
   6909                break;
   6910 
   6911             default:
   6912                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6913                vex_printf("first_opcode == 0xDF\n");
   6914                goto decode_fail;
   6915          }
   6916 
   6917       } else {
   6918 
   6919          delta++;
   6920          switch (modrm) {
   6921 
   6922             case 0xC0: /* FFREEP %st(0) */
   6923                DIP("ffreep %%st(%d)\n", 0);
   6924                put_ST_TAG ( 0, mkU8(0) );
   6925                fp_pop();
   6926                break;
   6927 
   6928             case 0xE0: /* FNSTSW %ax */
   6929                DIP("fnstsw %%ax\n");
   6930                /* Invent a plausible-looking FPU status word value and
   6931                   dump it in %AX:
   6932                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6933                */
   6934                putIRegRAX(
   6935                   2,
   6936                   unop(Iop_32to16,
   6937                        binop(Iop_Or32,
   6938                              binop(Iop_Shl32,
   6939                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6940                                    mkU8(11)),
   6941                              binop(Iop_And32,
   6942                                    unop(Iop_64to32, get_C3210()),
   6943                                    mkU32(0x4700))
   6944                )));
   6945                break;
   6946 
   6947             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6948                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6949                break;
   6950 
   6951             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6952                /* not really right since COMIP != UCOMIP */
   6953                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6954                break;
   6955 
   6956             default:
   6957                goto decode_fail;
   6958          }
   6959       }
   6960 
   6961    }
   6962 
   6963    else
   6964       goto decode_fail;
   6965 
   6966    *decode_ok = True;
   6967    return delta;
   6968 
   6969   decode_fail:
   6970    *decode_ok = False;
   6971    return delta;
   6972 }
   6973 
   6974 
   6975 /*------------------------------------------------------------*/
   6976 /*---                                                      ---*/
   6977 /*--- MMX INSTRUCTIONS                                     ---*/
   6978 /*---                                                      ---*/
   6979 /*------------------------------------------------------------*/
   6980 
   6981 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   6982    IA32 arch manual, volume 3):
   6983 
   6984    Read from, or write to MMX register (viz, any insn except EMMS):
   6985    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   6986    * FP stack pointer set to zero
   6987 
   6988    EMMS:
   6989    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   6990    * FP stack pointer set to zero
   6991 */
   6992 
   6993 static void do_MMX_preamble ( void )
   6994 {
   6995    Int         i;
   6996    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6997    IRExpr*     zero  = mkU32(0);
   6998    IRExpr*     tag1  = mkU8(1);
   6999    put_ftop(zero);
   7000    for (i = 0; i < 8; i++)
   7001       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7002 }
   7003 
   7004 static void do_EMMS_preamble ( void )
   7005 {
   7006    Int         i;
   7007    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7008    IRExpr*     zero  = mkU32(0);
   7009    IRExpr*     tag0  = mkU8(0);
   7010    put_ftop(zero);
   7011    for (i = 0; i < 8; i++)
   7012       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7013 }
   7014 
   7015 
   7016 static IRExpr* getMMXReg ( UInt archreg )
   7017 {
   7018    vassert(archreg < 8);
   7019    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7020 }
   7021 
   7022 
   7023 static void putMMXReg ( UInt archreg, IRExpr* e )
   7024 {
   7025    vassert(archreg < 8);
   7026    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7027    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7028 }
   7029 
   7030 
   7031 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   7032    sense that it does not first call do_MMX_preamble() -- that is the
   7033    responsibility of its caller. */
   7034 
/* Decode and translate one (E,G)-form MMX arithmetic/logic insn:
   G := G `op` E, where E is an mmx register or a 64-bit memory
   operand.  'opc' selects the operation; most map to a single IROp,
   while PMADDWD/PSADBW are handled via a clean helper call.  Returns
   the updated instruction-stream offset 'delta'.  Note: the caller is
   responsible for issuing do_MMX_preamble() first. */
static
ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                const HChar* name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;          /* complement G first (for PANDN) */
   IROp    op    = Iop_INVALID;    /* IR op, if one suffices */
   void*   hAddr = NULL;           /* helper fn, if an IR op doesn't */
   const HChar*  hName = NULL;
   Bool    eLeft = False;          /* True: E is the left operand */

   /* Record a helper-call implementation for this opcode. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   /* Map the opcode byte to an IR op (or helper), and note operand
      order (eLeft) and G-complementing (invG) where needed. */
   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack take E as the left operand (eLeft). */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;  /* PANDN: ~G & E */
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: either an mmx register or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Exactly one of (op, helper) must have been chosen above. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   7181 
   7182 
   7183 /* Vector by scalar shift of G by the amount specified at the bottom
   7184    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   7185 
   7186 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
   7187                                   Prefix pfx, Long delta,
   7188                                   const HChar* opname, IROp op )
   7189 {
   7190    HChar   dis_buf[50];
   7191    Int     alen, size;
   7192    IRTemp  addr;
   7193    Bool    shl, shr, sar;
   7194    UChar   rm   = getUChar(delta);
   7195    IRTemp  g0   = newTemp(Ity_I64);
   7196    IRTemp  g1   = newTemp(Ity_I64);
   7197    IRTemp  amt  = newTemp(Ity_I64);
   7198    IRTemp  amt8 = newTemp(Ity_I8);
   7199 
   7200    if (epartIsReg(rm)) {
   7201       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7202       DIP("%s %s,%s\n", opname,
   7203                         nameMMXReg(eregLO3ofRM(rm)),
   7204                         nameMMXReg(gregLO3ofRM(rm)) );
   7205       delta++;
   7206    } else {
   7207       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7208       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7209       DIP("%s %s,%s\n", opname,
   7210                         dis_buf,
   7211                         nameMMXReg(gregLO3ofRM(rm)) );
   7212       delta += alen;
   7213    }
   7214    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7215    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7216 
   7217    shl = shr = sar = False;
   7218    size = 0;
   7219    switch (op) {
   7220       case Iop_ShlN16x4: shl = True; size = 32; break;
   7221       case Iop_ShlN32x2: shl = True; size = 32; break;
   7222       case Iop_Shl64:    shl = True; size = 64; break;
   7223       case Iop_ShrN16x4: shr = True; size = 16; break;
   7224       case Iop_ShrN32x2: shr = True; size = 32; break;
   7225       case Iop_Shr64:    shr = True; size = 64; break;
   7226       case Iop_SarN16x4: sar = True; size = 16; break;
   7227       case Iop_SarN32x2: sar = True; size = 32; break;
   7228       default: vassert(0);
   7229    }
   7230 
   7231    if (shl || shr) {
   7232      assign(
   7233         g1,
   7234         IRExpr_ITE(
   7235            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7236            binop(op, mkexpr(g0), mkexpr(amt8)),
   7237            mkU64(0)
   7238         )
   7239      );
   7240    } else
   7241    if (sar) {
   7242      assign(
   7243         g1,
   7244         IRExpr_ITE(
   7245            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7246            binop(op, mkexpr(g0), mkexpr(amt8)),
   7247            binop(op, mkexpr(g0), mkU8(size-1))
   7248         )
   7249      );
   7250    } else {
   7251       vassert(0);
   7252    }
   7253 
   7254    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7255    return delta;
   7256 }
   7257 
   7258 
   7259 /* Vector by scalar shift of E by an immediate byte.  This is a
   7260    straight copy of dis_SSE_shiftE_imm. */
   7261 
   7262 static
   7263 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7264 {
   7265    Bool    shl, shr, sar;
   7266    UChar   rm   = getUChar(delta);
   7267    IRTemp  e0   = newTemp(Ity_I64);
   7268    IRTemp  e1   = newTemp(Ity_I64);
   7269    UChar   amt, size;
   7270    vassert(epartIsReg(rm));
   7271    vassert(gregLO3ofRM(rm) == 2
   7272            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7273    amt = getUChar(delta+1);
   7274    delta += 2;
   7275    DIP("%s $%d,%s\n", opname,
   7276                       (Int)amt,
   7277                       nameMMXReg(eregLO3ofRM(rm)) );
   7278 
   7279    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7280 
   7281    shl = shr = sar = False;
   7282    size = 0;
   7283    switch (op) {
   7284       case Iop_ShlN16x4: shl = True; size = 16; break;
   7285       case Iop_ShlN32x2: shl = True; size = 32; break;
   7286       case Iop_Shl64:    shl = True; size = 64; break;
   7287       case Iop_SarN16x4: sar = True; size = 16; break;
   7288       case Iop_SarN32x2: sar = True; size = 32; break;
   7289       case Iop_ShrN16x4: shr = True; size = 16; break;
   7290       case Iop_ShrN32x2: shr = True; size = 32; break;
   7291       case Iop_Shr64:    shr = True; size = 64; break;
   7292       default: vassert(0);
   7293    }
   7294 
   7295    if (shl || shr) {
   7296      assign( e1, amt >= size
   7297                     ? mkU64(0)
   7298                     : binop(op, mkexpr(e0), mkU8(amt))
   7299      );
   7300    } else
   7301    if (sar) {
   7302      assign( e1, amt >= size
   7303                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7304                     : binop(op, mkexpr(e0), mkU8(amt))
   7305      );
   7306    } else {
   7307       vassert(0);
   7308    }
   7309 
   7310    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7311    return delta;
   7312 }
   7313 
   7314 
   7315 /* Completely handle all MMX instructions except emms. */
   7316 
/* Top-level decoder for the MMX instruction group (everything except
   EMMS).  'sz' is the effective operand size from prefixes; 'delta'
   indexes the opcode byte.  On success sets *decode_ok = True and
   returns the updated delta; on failure sets *decode_ok = False (the
   returned delta is then ignored by the caller). */
static
ULong dis_MMX ( Bool* decode_ok,
                const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
{
   Int   len;
   UChar modrm;
   HChar dis_buf[50];
   UChar opc = getUChar(delta);
   delta++;

   /* dis_MMX handles all insns except emms. */
   do_MMX_preamble();

   switch (opc) {

      case 0x6E:
         if (sz == 4) {
            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               /* 32-bit source is zero-extended into the 64-bit mmx reg. */
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
               DIP("movd %s, %s\n",
                   nameIReg32(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         loadLE(Ity_I32, mkexpr(addr)) ) );
               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg( gregLO3ofRM(modrm),
                          getIReg64(eregOfRexRM(pfx,modrm)) );
               DIP("movd %s, %s\n",
                   nameIReg64(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg( gregLO3ofRM(modrm),
                          loadLE(Ity_I64, mkexpr(addr)) );
               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else {
            goto mmx_decode_failure;
         }
         break;

      case 0x7E:
         if (sz == 4) {
            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg32( eregOfRexRM(pfx,modrm),
                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg32(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg64( eregOfRexRM(pfx,modrm),
                          getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg64(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                       getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         } else {
            goto mmx_decode_failure;
         }
         break;

      case 0x6F:
         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(eregLO3ofRM(modrm)),
                nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movq %s, %s\n",
                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
         }
         break;

      case 0x7F:
         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)),
                nameMMXReg(eregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("mov(nt)q %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
         }
         break;

      /* The arithmetic/logic group: each case just validates the size
         and forwards to dis_MMXop_regmem_to_reg, which does the real
         work of operand fetch and IR emission. */
      case 0xFC:
      case 0xFD:
      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
         break;

      case 0xEC:
      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
         break;

      case 0xDC:
      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
         break;

      case 0xF8:
      case 0xF9:
      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
         break;

      case 0xE8:
      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
         break;

      case 0xD8:
      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
         break;

      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
         break;

      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
         break;

      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
         vassert(sz == 4);
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
         break;

      case 0x74:
      case 0x75:
      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
         break;

      case 0x64:
      case 0x65:
      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
         break;

      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
         break;

      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
         break;

      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
         break;

      case 0x68:
      case 0x69:
      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
         break;

      case 0x60:
      case 0x61:
      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
         break;

      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
         break;

      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
         break;

      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
         break;

      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
         break;

#     define SHIFT_BY_REG(_name,_op)                                     \
                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
                break;

      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);

      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);

      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);

#     undef SHIFT_BY_REG

      case 0x71:
      case 0x72:
      case 0x73: {
         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
         /* The reg field of the modrm byte is a sub-opcode selecting
            SRL (2), SRA (4) or SHL (6); opc selects the granularity. */
         UChar byte2, subopc;
         if (sz != 4)
            goto mmx_decode_failure;
         byte2  = getUChar(delta);      /* amode / sub-opcode */
         subopc = toUChar( (byte2 >> 3) & 7 );

#        define SHIFT_BY_IMM(_name,_op)                        \
            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
            } while (0)

              if (subopc == 2 /*SRL*/ && opc == 0x71)
                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psllq", Iop_Shl64);

         else goto mmx_decode_failure;

#        undef SHIFT_BY_IMM
         break;
      }

      case 0xF7: {
         /* MASKMOVQ: byte-wise conditional store of regD to [RDI],
            selecting via the top bit of each byte of regM.  Done as a
            load/merge/store of the full 64 bits. */
         IRTemp addr    = newTemp(Ity_I64);
         IRTemp regD    = newTemp(Ity_I64);
         IRTemp regM    = newTemp(Ity_I64);
         IRTemp mask    = newTemp(Ity_I64);
         IRTemp olddata = newTemp(Ity_I64);
         IRTemp newdata = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (sz != 4 || (!epartIsReg(modrm)))
            goto mmx_decode_failure;
         delta++;

         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
         /* Sign-extending each byte's MSB across the byte gives a
            0x00/0xFF per-byte select mask. */
         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_Or64,
                       binop(Iop_And64,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_And64,
                             mkexpr(olddata),
                             unop(Iop_Not64, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );
         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
                                 nameMMXReg( gregLO3ofRM(modrm) ) );
         break;
      }

      /* --- MMX decode failure --- */
      default:
      mmx_decode_failure:
         *decode_ok = False;
         return delta; /* ignored */

   }

   *decode_ok = True;
   return delta;
}
   7705 
   7706 
   7707 /*------------------------------------------------------------*/
   7708 /*--- More misc arithmetic and other obscure insns.        ---*/
   7709 /*------------------------------------------------------------*/
   7710 
   7711 /* Generate base << amt with vacated places filled with stuff
   7712    from xtra.  amt guaranteed in 0 .. 63. */
   7713 static
   7714 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7715 {
   7716    /* if   amt == 0
   7717       then base
   7718       else (base << amt) | (xtra >>u (64-amt))
   7719    */
   7720    return
   7721       IRExpr_ITE(
   7722          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7723          binop(Iop_Or64,
   7724                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7725                binop(Iop_Shr64, mkexpr(xtra),
   7726                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7727                ),
   7728          mkexpr(base)
   7729       );
   7730 }
   7731 
   7732 /* Generate base >>u amt with vacated places filled with stuff
   7733    from xtra.  amt guaranteed in 0 .. 63. */
   7734 static
   7735 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7736 {
   7737    /* if   amt == 0
   7738       then base
   7739       else (base >>u amt) | (xtra << (64-amt))
   7740    */
   7741    return
   7742       IRExpr_ITE(
   7743          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7744          binop(Iop_Or64,
   7745                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7746                binop(Iop_Shl64, mkexpr(xtra),
   7747                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7748                ),
   7749          mkexpr(base)
   7750       );
   7751 }
   7752 
/* Double length left and right shifts.  Apparently only required in
   v-size (no b- variant).

   Translates SHLD/SHRD.  The E operand (reg or mem) is the
   destination; the G register supplies the bits shifted in; shift_amt
   supplies the (not yet masked) shift count, either an immediate
   (amt_is_literal) or CL.  Returns the updated delta. */
static
ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        const HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);
   IRTemp esrc   = newTemp(ty);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);
   IRTemp tmpSS  = newTemp(Ity_I8);
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* Architectural shift-count mask: 63 for 64-bit operands, else 31. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS = (tmpSH - 1) & mask, i.e. a shift by one less than
      the real amount), the shifted value (res64) and the subshifted
      value (rss64). */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk.  The subshifted value
      (rssTy) is handed to the thunk alongside the result --
      presumably so the flag computation can recover the last bit
      shifted out; confirm against setFlags_DEP1_DEP2_shift. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   if (amt_is_literal) delta++;
   return delta;
}
   7921 
   7922 
   7923 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7924    required. */
   7925 
   7926 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7927 
   7928 static const HChar* nameBtOp ( BtOp op )
   7929 {
   7930    switch (op) {
   7931       case BtOpNone:  return "";
   7932       case BtOpSet:   return "s";
   7933       case BtOpReset: return "r";
   7934       case BtOpComp:  return "c";
   7935       default: vpanic("nameBtOp(amd64)");
   7936    }
   7937 }
   7938 
   7939 
/* Translate BT/BTS/BTR/BTC G(reg), E(reg-or-mem).  G supplies the bit
   number, E is the value operated on.  Returns the updated delta;
   sets *decode_OK to False (doing nothing else) on an unacceptable
   prefix combination.  Afterwards C holds the selected bit and
   O,S,Z,A,P are set to zero. */
static
ULong dis_bt_G_E ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op,
                   /*OUT*/Bool* decode_OK )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
          t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);
   t_new     = newTemp(Ity_I8);
   t_bitno0  = newTemp(Ity_I64);
   t_bitno1  = newTemp(Ity_I64);
   t_bitno2  = newTemp(Ity_I8);
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   /* Vet the prefixes before generating any IR. */
   *decode_OK = True;
   if (epartIsReg(modrm)) {
      /* F2 and F3 are never acceptable. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowed, provided LOCK is also
         present, and only for the BTC/BTS/BTR cases (not BT). */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Bit number from G, sign-widened to 64 bits. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      /* Memory case: the bit number is not masked, so it may index
         beyond the addressed unit. */
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence.  Address of the byte containing the bit:
      t_addr0 + (t_bitno1 >>s 3).  The arithmetic shift keeps the
      byte offset negative when the (sign-widened) bit number is
      negative. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* LOCKed memory update must go through compare-and-swap; the
         reg case (spilled to the client stack) and unlocked memory
         case use a plain store. */
      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   8113 
   8114 
   8115 
/* Handle BSF/BSR.  Only v-size seems necessary.  fwds==True gives BSF
   (scan from bit 0 upwards, via Ctz64); fwds==False gives BSR (scan
   from the top bit downwards, via 63 - Clz64).  Sets Z iff the source
   is zero; all other flags are forced to zero.  If the source is
   zero, the destination register is left unchanged.  Returns the
   updated delta. */
static
ULong dis_bs_E_G ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);
   IRTemp dst   = newTemp(ty);
   IRTemp src64 = newTemp(Ity_I64);
   IRTemp dst64 = newTemp(Ity_I64);
   IRTemp srcB  = newTemp(Ity_I1);

   vassert(sz == 8 || sz == 4 || sz == 2);

   /* Fetch the source operand (E: reg or mem). */
   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate a bool expression which is zero iff the original is
      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
      instrumented by Memcheck, is instrumented expensively, since
      this may be used on the output of a preceding movmskb insn,
      which has been known to be partially defined, and in need of
      careful handling. */
   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_ITE( mkexpr(srcB),
                        /* src!=0 */
                        mkU64(0),
                        /* src==0 */
                        mkU64(AMD64G_CC_MASK_Z)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero. */
   assign( dst64,
           IRExpr_ITE(
              mkexpr(srcB),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64))),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) )
           )
         );

   /* Narrow the 64-bit result back to the operand size. */
   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   8232 
   8233 
   8234 /* swap rAX with the reg specified by reg and REX.B */
   8235 static
   8236 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8237 {
   8238    IRType ty = szToITy(sz);
   8239    IRTemp t1 = newTemp(ty);
   8240    IRTemp t2 = newTemp(ty);
   8241    vassert(sz == 2 || sz == 4 || sz == 8);
   8242    vassert(regLo3 < 8);
   8243    if (sz == 8) {
   8244       assign( t1, getIReg64(R_RAX) );
   8245       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8246       putIReg64( R_RAX, mkexpr(t2) );
   8247       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8248    } else if (sz == 4) {
   8249       assign( t1, getIReg32(R_RAX) );
   8250       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8251       putIReg32( R_RAX, mkexpr(t2) );
   8252       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8253    } else {
   8254       assign( t1, getIReg16(R_RAX) );
   8255       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8256       putIReg16( R_RAX, mkexpr(t2) );
   8257       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8258    }
   8259    DIP("xchg%c %s, %s\n",
   8260        nameISize(sz), nameIRegRAX(sz),
   8261                       nameIRegRexB(sz,pfx, regLo3));
   8262 }
   8263 
   8264 
/* Implement SAHF: load S,Z,A,P,C from %AH, retaining the existing O
   flag.  Generates IR only; no result value. */
static
void codegen_SAHF ( void )
{
   /* Set the flags to:
      (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
                                    -- retain the old O flag
      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   */
   ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
                       |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   /* Materialise the current flags before overwriting the thunk. */
   IRTemp oldflags   = newTemp(Ity_I64);
   assign( oldflags, mk_amd64g_calculate_rflags_all() );
   /* Switch the thunk to COPY mode: DEP1 then holds the flag bits
      directly. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   /* DEP1 = (old flags & O) | (AH & SZACP); AH is RAX >> 8. */
   stmt( IRStmt_Put( OFFB_CC_DEP1,
         binop(Iop_Or64,
               binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
               binop(Iop_And64,
                     binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
                     mkU64(mask_SZACP))
              )
   ));
}
   8290 
   8291 
   8292 static
   8293 void codegen_LAHF ( void  )
   8294 {
   8295    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   8296    IRExpr* rax_with_hole;
   8297    IRExpr* new_byte;
   8298    IRExpr* new_rax;
   8299    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8300                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8301 
   8302    IRTemp  flags = newTemp(Ity_I64);
   8303    assign( flags, mk_amd64g_calculate_rflags_all() );
   8304 
   8305    rax_with_hole
   8306       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8307    new_byte
   8308       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8309                         mkU64(1<<1));
   8310    new_rax
   8311       = binop(Iop_Or64, rax_with_hole,
   8312                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8313    putIReg64(R_RAX, new_rax);
   8314 }
   8315 
   8316 
/* Translate CMPXCHG G, E: compare rAX with E; on equality (ZF=1 via
   the cmp-style flags below) E receives G, otherwise rAX receives E.
   Returns the updated delta; sets *ok to False on an unacceptable
   prefix combination. */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        const VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);       /* rAX (the "accumulator") */
   IRTemp src   = newTemp(ty);       /* G: proposed new value */
   IRTemp dest  = newTemp(ty);       /* E: old value */
   IRTemp dest2 = newTemp(ty);       /* E: value written back */
   IRTemp acc2  = newTemp(ty);       /* rAX: value written back */
   IRTemp cond  = newTemp(Ity_I1);   /* True iff acc == dest */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on ITE

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on ITE

      reg-mem, locked: use IRCAS
   */

   /* Decide whether F2 or F3 are acceptable.  Never for register
      case, but for the memory case, one or the other is OK provided
      LOCK is also present. */
   if (epartIsReg(rm)) {
      if (haveF2orF3(pfx)) {
         *ok = False;
         return delta0;
      }
   } else {
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *ok = False;
            return delta0;
         }
      }
   }

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags from acc - dest, exactly as for CMP. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   8423 
   8424 
   8425 /* Handle conditional move instructions of the form
   8426       cmovcc E(reg-or-mem), G(reg)
   8427 
   8428    E(src) is reg-or-mem
   8429    G(dst) is reg.
   8430 
   8431    If E is reg, -->    GET %E, tmps
   8432                        GET %G, tmpd
   8433                        CMOVcc tmps, tmpd
   8434                        PUT tmpd, %G
   8435 
   8436    If E is mem  -->    (getAddr E) -> tmpa
   8437                        LD (tmpa), tmps
   8438                        GET %G, tmpd
   8439                        CMOVcc tmps, tmpd
   8440                        PUT tmpd, %G
   8441 */
   8442 static
   8443 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
   8444                      Prefix        pfx,
   8445                      Int           sz,
   8446                      AMD64Condcode cond,
   8447                      Long          delta0 )
   8448 {
   8449    UChar rm  = getUChar(delta0);
   8450    HChar dis_buf[50];
   8451    Int   len;
   8452 
   8453    IRType ty   = szToITy(sz);
   8454    IRTemp tmps = newTemp(ty);
   8455    IRTemp tmpd = newTemp(ty);
   8456 
   8457    if (epartIsReg(rm)) {
   8458       assign( tmps, getIRegE(sz, pfx, rm) );
   8459       assign( tmpd, getIRegG(sz, pfx, rm) );
   8460 
   8461       putIRegG( sz, pfx, rm,
   8462                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8463                             mkexpr(tmps),
   8464                             mkexpr(tmpd) )
   8465               );
   8466       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8467                             nameIRegE(sz,pfx,rm),
   8468                             nameIRegG(sz,pfx,rm));
   8469       return 1+delta0;
   8470    }
   8471 
   8472    /* E refers to memory */
   8473    {
   8474       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8475       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8476       assign( tmpd, getIRegG(sz, pfx, rm) );
   8477 
   8478       putIRegG( sz, pfx, rm,
   8479                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8480                             mkexpr(tmps),
   8481                             mkexpr(tmpd) )
   8482               );
   8483 
   8484       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8485                             dis_buf,
   8486                             nameIRegG(sz,pfx,rm));
   8487       return len+delta0;
   8488    }
   8489 }
   8490 
   8491 
/* Disassemble XADD G,E: the old value of E is written to G, and
   E receives the sum E+G; the sum also sets the arithmetic flags.
   On success sets *decode_ok to True and returns the delta past
   the consumed bytes. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     const VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old value of E (the destination) */
   IRTemp tmpt0 = newTemp(ty);   /* old value of G */
   IRTemp tmpt1 = newTemp(ty);   /* the sum, E+G */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* The CAS both performs the atomic store of the sum and, via
         the expected-value check, arranges a restart of the insn if
         another thread changed the location in the meantime. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
   8565 
   8566 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8567 //..
   8568 //.. static
   8569 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8570 //.. {
   8571 //..    Int    len;
   8572 //..    IRTemp addr;
   8573 //..    UChar  rm  = getUChar(delta0);
   8574 //..    HChar  dis_buf[50];
   8575 //..
   8576 //..    if (epartIsReg(rm)) {
   8577 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8578 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8579 //..       return 1+delta0;
   8580 //..    } else {
   8581 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8582 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8583 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8584 //..       return len+delta0;
   8585 //..    }
   8586 //.. }
   8587 //..
   8588 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8589 //..    dst is ireg and sz==4, zero out top half of it.  */
   8590 //..
   8591 //.. static
   8592 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8593 //..                      Int   sz,
   8594 //..                      UInt  delta0 )
   8595 //.. {
   8596 //..    Int    len;
   8597 //..    IRTemp addr;
   8598 //..    UChar  rm  = getUChar(delta0);
   8599 //..    HChar  dis_buf[50];
   8600 //..
   8601 //..    vassert(sz == 2 || sz == 4);
   8602 //..
   8603 //..    if (epartIsReg(rm)) {
   8604 //..       if (sz == 4)
   8605 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8606 //..       else
   8607 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8608 //..
   8609 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8610 //..       return 1+delta0;
   8611 //..    } else {
   8612 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8613 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8614 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8615 //..       return len+delta0;
   8616 //..    }
   8617 //.. }
   8618 //..
   8619 //..
   8620 //.. static
   8621 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8622 //.. {
   8623 //..     IRTemp t1 = newTemp(Ity_I16);
   8624 //..     IRTemp ta = newTemp(Ity_I32);
   8625 //..     vassert(sz == 2 || sz == 4);
   8626 //..
   8627 //..     assign( t1, getSReg(sreg) );
   8628 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8629 //..     putIReg(4, R_ESP, mkexpr(ta));
   8630 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8631 //..
   8632 //..     DIP("pushw %s\n", nameSReg(sreg));
   8633 //.. }
   8634 //..
   8635 //.. static
   8636 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8637 //.. {
   8638 //..     IRTemp t1 = newTemp(Ity_I16);
   8639 //..     IRTemp ta = newTemp(Ity_I32);
   8640 //..     vassert(sz == 2 || sz == 4);
   8641 //..
   8642 //..     assign( ta, getIReg(4, R_ESP) );
   8643 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8644 //..
   8645 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8646 //..     putSReg( sreg, mkexpr(t1) );
   8647 //..     DIP("pop %s\n", nameSReg(sreg));
   8648 //.. }
   8649 
   8650 static
   8651 void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
   8652 {
   8653    IRTemp t1 = newTemp(Ity_I64);
   8654    IRTemp t2 = newTemp(Ity_I64);
   8655    IRTemp t3 = newTemp(Ity_I64);
   8656    assign(t1, getIReg64(R_RSP));
   8657    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8658    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8659    putIReg64(R_RSP, mkexpr(t3));
   8660    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8661    jmp_treg(dres, Ijk_Ret, t2);
   8662    vassert(dres->whatNext == Dis_StopHere);
   8663 }
   8664 
   8665 
   8666 /*------------------------------------------------------------*/
   8667 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8668 /*------------------------------------------------------------*/
   8669 
   8670 /* Indicates whether the op requires a rounding-mode argument.  Note
   8671    that this covers only vector floating point arithmetic ops, and
   8672    omits the scalar ones that need rounding modes.  Note also that
   8673    inconsistencies here will get picked up later by the IR sanity
   8674    checker, so this isn't correctness-critical. */
   8675 static Bool requiresRMode ( IROp op )
   8676 {
   8677    switch (op) {
   8678       /* 128 bit ops */
   8679       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8680       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8681       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8682       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8683       /* 256 bit ops */
   8684       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8685       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8686       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8687       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8688          return True;
   8689       default:
   8690          break;
   8691    }
   8692    return False;
   8693 }
   8694 
   8695 
   8696 /* Worker function; do not call directly.
   8697    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8698 */
   8699 
   8700 static ULong dis_SSE_E_to_G_all_wrk (
   8701                 const VexAbiInfo* vbi,
   8702                 Prefix pfx, Long delta,
   8703                 const HChar* opname, IROp op,
   8704                 Bool   invertG
   8705              )
   8706 {
   8707    HChar   dis_buf[50];
   8708    Int     alen;
   8709    IRTemp  addr;
   8710    UChar   rm = getUChar(delta);
   8711    Bool    needsRMode = requiresRMode(op);
   8712    IRExpr* gpart
   8713       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8714                 : getXMMReg(gregOfRexRM(pfx,rm));
   8715    if (epartIsReg(rm)) {
   8716       putXMMReg(
   8717          gregOfRexRM(pfx,rm),
   8718          needsRMode
   8719             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8720                         gpart,
   8721                         getXMMReg(eregOfRexRM(pfx,rm)))
   8722             : binop(op, gpart,
   8723                         getXMMReg(eregOfRexRM(pfx,rm)))
   8724       );
   8725       DIP("%s %s,%s\n", opname,
   8726                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8727                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8728       return delta+1;
   8729    } else {
   8730       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8731       putXMMReg(
   8732          gregOfRexRM(pfx,rm),
   8733          needsRMode
   8734             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8735                         gpart,
   8736                         loadLE(Ity_V128, mkexpr(addr)))
   8737             : binop(op, gpart,
   8738                         loadLE(Ity_V128, mkexpr(addr)))
   8739       );
   8740       DIP("%s %s,%s\n", opname,
   8741                         dis_buf,
   8742                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8743       return delta+alen;
   8744    }
   8745 }
   8746 
   8747 
   8748 /* All lanes SSE binary operation, G = G `op` E. */
   8749 
   8750 static
   8751 ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
   8752                            Prefix pfx, Long delta,
   8753                            const HChar* opname, IROp op )
   8754 {
   8755    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8756 }
   8757 
   8758 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8759 
   8760 static
   8761 ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
   8762                                 Prefix pfx, Long delta,
   8763                                 const HChar* opname, IROp op )
   8764 {
   8765    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8766 }
   8767 
   8768 
   8769 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8770 
/* Lowest-32-bit-lane-only SSE binary op: G = G `op` E.  'op' is
   expected to be a lowest-lane-only operation, so the upper three
   lanes of G are carried through by the op itself.  Returns the
   updated delta. */
static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* Register E: pass the whole xmm register to the op. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8803 
   8804 
   8805 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8806 
   8807 static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
   8808                                    Prefix pfx, Long delta,
   8809                                    const HChar* opname, IROp op )
   8810 {
   8811    HChar   dis_buf[50];
   8812    Int     alen;
   8813    IRTemp  addr;
   8814    UChar   rm = getUChar(delta);
   8815    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8816    if (epartIsReg(rm)) {
   8817       putXMMReg( gregOfRexRM(pfx,rm),
   8818                  binop(op, gpart,
   8819                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8820       DIP("%s %s,%s\n", opname,
   8821                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8822                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8823       return delta+1;
   8824    } else {
   8825       /* We can only do a 64-bit memory read, so the upper half of the
   8826          E operand needs to be made simply of zeroes. */
   8827       IRTemp epart = newTemp(Ity_V128);
   8828       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8829       assign( epart, unop( Iop_64UtoV128,
   8830                            loadLE(Ity_I64, mkexpr(addr))) );
   8831       putXMMReg( gregOfRexRM(pfx,rm),
   8832                  binop(op, gpart, mkexpr(epart)) );
   8833       DIP("%s %s,%s\n", opname,
   8834                         dis_buf,
   8835                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8836       return delta+alen;
   8837    }
   8838 }
   8839 
   8840 
   8841 /* All lanes unary SSE operation, G = op(E). */
   8842 
/* All-lanes unary SSE op: G = op(E), where E is an xmm register or
   a 128-bit memory operand.  Returns the updated delta. */
static ULong dis_SSE_E_to_G_unary_all (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   // up in the usual way.
   Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   if (epartIsReg(rm)) {
      /* Register source. */
      IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
      /* XXXROUNDINGFIXME */
      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
                              : unop(op, src);
      putXMMReg( gregOfRexRM(pfx,rm), res );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory source: a full 128-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
      /* XXXROUNDINGFIXME */
      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
                              : unop(op, src);
      putXMMReg( gregOfRexRM(pfx,rm), res );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8879 
   8880 
   8881 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8882 
   8883 static ULong dis_SSE_E_to_G_unary_lo32 (
   8884                 const VexAbiInfo* vbi,
   8885                 Prefix pfx, Long delta,
   8886                 const HChar* opname, IROp op
   8887              )
   8888 {
   8889    /* First we need to get the old G value and patch the low 32 bits
   8890       of the E operand into it.  Then apply op and write back to G. */
   8891    HChar   dis_buf[50];
   8892    Int     alen;
   8893    IRTemp  addr;
   8894    UChar   rm = getUChar(delta);
   8895    IRTemp  oldG0 = newTemp(Ity_V128);
   8896    IRTemp  oldG1 = newTemp(Ity_V128);
   8897 
   8898    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8899 
   8900    if (epartIsReg(rm)) {
   8901       assign( oldG1,
   8902               binop( Iop_SetV128lo32,
   8903                      mkexpr(oldG0),
   8904                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8905       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8906       DIP("%s %s,%s\n", opname,
   8907                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8908                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8909       return delta+1;
   8910    } else {
   8911       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8912       assign( oldG1,
   8913               binop( Iop_SetV128lo32,
   8914                      mkexpr(oldG0),
   8915                      loadLE(Ity_I32, mkexpr(addr)) ));
   8916       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8917       DIP("%s %s,%s\n", opname,
   8918                         dis_buf,
   8919                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8920       return delta+alen;
   8921    }
   8922 }
   8923 
   8924 
   8925 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   8926 
/* Lowest-64-bit-lane-only unary SSE op, G = op(E): take the old G
   value, splice the low 64 bits of E into it, apply op, and write
   the result back to G.  Returns the updated delta. */
static ULong dis_SSE_E_to_G_unary_lo64 (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 64 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G before the op */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's low lane spliced in */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory source: read only 64 bits. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     loadLE(Ity_I64, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8967 
   8968 
   8969 /* SSE integer binary operation:
   8970       G = G `op` E   (eLeft == False)
   8971       G = E `op` G   (eLeft == True)
   8972 */
   8973 static ULong dis_SSEint_E_to_G(
   8974                 const VexAbiInfo* vbi,
   8975                 Prefix pfx, Long delta,
   8976                 const HChar* opname, IROp op,
   8977                 Bool   eLeft
   8978              )
   8979 {
   8980    HChar   dis_buf[50];
   8981    Int     alen;
   8982    IRTemp  addr;
   8983    UChar   rm = getUChar(delta);
   8984    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8985    IRExpr* epart = NULL;
   8986    if (epartIsReg(rm)) {
   8987       epart = getXMMReg(eregOfRexRM(pfx,rm));
   8988       DIP("%s %s,%s\n", opname,
   8989                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8990                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8991       delta += 1;
   8992    } else {
   8993       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8994       epart = loadLE(Ity_V128, mkexpr(addr));
   8995       DIP("%s %s,%s\n", opname,
   8996                         dis_buf,
   8997                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8998       delta += alen;
   8999    }
   9000    putXMMReg( gregOfRexRM(pfx,rm),
   9001               eLeft ? binop(op, epart, gpart)
   9002                     : binop(op, gpart, epart) );
   9003    return delta;
   9004 }
   9005 
   9006 
   9007 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   9008    This is all a bit of a kludge in that it ignores the subtleties of
   9009    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9010    spec. */
/* Map an SSE/AVX compare-predicate imm8 to (preSwap, op, postNot):
   whether to swap the operands first, which IR compare op to use,
   and whether to invert the result afterwards.  'all_lanes' and
   'sz' (4 or 8) then select the correctly-sized/laned variant of
   the op.  Returns False for unhandled imm8 values. */
static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
                           /*OUT*/IROp* opP,
                           /*OUT*/Bool* postNotP,
                           UInt imm8, Bool all_lanes, Int sz )
{
   if (imm8 >= 32) return False;

   /* First, compute a (preSwap, op, postNot) triple from
      the supplied imm8. */
   Bool pre = False;
   IROp op  = Iop_INVALID;
   Bool not = False;

#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   // If you add a case here, add a corresponding test for both VCMPSD_128
   // and VCMPSS_128 in avx-1.c.
   switch (imm8) {
      // "O" = ordered, "U" = unordered
      // "Q" = non-signalling (quiet), "S" = signalling
      //
      //             swap operands?
      //             |
      //             |      cmp op          invert after?
      //             |      |               |
      //             v      v               v
      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns] */
      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
      // 0xB  FALSE_OQ
      // 0xC: this isn't really right because it returns all-1s when
      // either operand is a NaN, and it should return all-0s.
      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
      // 0xF  TRUE_UQ
      // 0x10  EQ_OS
      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
      // 0x13  UNORD_S
      // 0x14  NEQ_US
      // 0x15  NLT_UQ
      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
      // 0x17  ORD_S
      // 0x18  EQ_US
      // 0x19  NGE_UQ
      // 0x1A  NGT_UQ
      // 0x1B  FALSE_OS
      // 0x1C  NEQ_OS
      // 0x1D  GE_OQ
      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
      // 0x1F  TRUE_US
      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
         avx-1.c if new cases turn up. */
      default: break;
   }
#  undef XXX
   if (op == Iop_INVALID) return False;

   /* Now convert the op into one with the same arithmetic but that is
      correct for the width and laneage requirements.  The table above
      is expressed in terms of 32Fx4 ops; remap per (sz, all_lanes). */

   /**/ if (sz == 4 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
         default: vassert(0);
      }
   }
   else if (sz == 4 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
         default: vassert(0);
      }
   }
   else {
      vpanic("findSSECmpOp(amd64,guest)");
   }

   *preSwapP = pre; *opP = op; *postNotP = not;
   return True;
}
   9124 
   9125 
   9126 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9127    returns the original delta to indicate failure. */
   9128 
static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool all_lanes, Int sz )
{
   /* CMPPS/CMPPD/CMPSS/CMPSD-style compare: G = G cmp(imm8) E.
      Only imm8 < 8 (the non-VEX predicates) are accepted; anything
      else returns the original delta to signal decode failure. */
   Long    delta0 = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);   /* result before any post-not */
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* Memory E.  The imm8 byte follows the amode, hence the extra
         '1' passed to disAMode and the alen+1 delta bump. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   /* Apply the post-inversion, if any.  For the lowest-lane-only
      forms, only the participating lane's bits are flipped. */
   if (postNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   9200 
   9201 
   9202 /* Vector by scalar shift of G by the amount specified at the bottom
   9203    of E. */
   9204 
   9205 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
   9206                                   Prefix pfx, Long delta,
   9207                                   const HChar* opname, IROp op )
   9208 {
   9209    HChar   dis_buf[50];
   9210    Int     alen, size;
   9211    IRTemp  addr;
   9212    Bool    shl, shr, sar;
   9213    UChar   rm   = getUChar(delta);
   9214    IRTemp  g0   = newTemp(Ity_V128);
   9215    IRTemp  g1   = newTemp(Ity_V128);
   9216    IRTemp  amt  = newTemp(Ity_I64);
   9217    IRTemp  amt8 = newTemp(Ity_I8);
   9218    if (epartIsReg(rm)) {
   9219       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9220       DIP("%s %s,%s\n", opname,
   9221                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9222                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9223       delta++;
   9224    } else {
   9225       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9226       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9227       DIP("%s %s,%s\n", opname,
   9228                         dis_buf,
   9229                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9230       delta += alen;
   9231    }
   9232    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9233    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9234 
   9235    shl = shr = sar = False;
   9236    size = 0;
   9237    switch (op) {
   9238       case Iop_ShlN16x8: shl = True; size = 32; break;
   9239       case Iop_ShlN32x4: shl = True; size = 32; break;
   9240       case Iop_ShlN64x2: shl = True; size = 64; break;
   9241       case Iop_SarN16x8: sar = True; size = 16; break;
   9242       case Iop_SarN32x4: sar = True; size = 32; break;
   9243       case Iop_ShrN16x8: shr = True; size = 16; break;
   9244       case Iop_ShrN32x4: shr = True; size = 32; break;
   9245       case Iop_ShrN64x2: shr = True; size = 64; break;
   9246       default: vassert(0);
   9247    }
   9248 
   9249    if (shl || shr) {
   9250      assign(
   9251         g1,
   9252         IRExpr_ITE(
   9253            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9254            binop(op, mkexpr(g0), mkexpr(amt8)),
   9255            mkV128(0x0000)
   9256         )
   9257      );
   9258    } else
   9259    if (sar) {
   9260      assign(
   9261         g1,
   9262         IRExpr_ITE(
   9263            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9264            binop(op, mkexpr(g0), mkexpr(amt8)),
   9265            binop(op, mkexpr(g0), mkU8(size-1))
   9266         )
   9267      );
   9268    } else {
   9269       vassert(0);
   9270    }
   9271 
   9272    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9273    return delta;
   9274 }
   9275 
   9276 
   9277 /* Vector by scalar shift of E by an immediate byte. */
   9278 
static
ULong dis_SSE_shiftE_imm ( Prefix pfx,
                           Long delta, const HChar* opname, IROp op )
{
   /* Shift each lane of E (always a register here) by an immediate
      byte, writing the result back to E.  Returns the updated decode
      offset. */
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);
   IRTemp  e1   = newTemp(Ity_V128);
   UChar   amt, size;
   /* Only the register forms with greg = /2, /4 or /6 are handled. */
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);     /* immediate shift count */
   delta += 2;                  /* modrm byte + imm8 */
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRexRM(pfx,rm)) );
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   /* 'size' is the lane width in bits, used to clamp out-of-range
      immediates below. */
   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   /* The immediate is known at decode time, so the clamping is done
      here rather than with an ITE: logical shifts of >= width give
      zero, arithmetic shifts give a shift by width-1. */
   if (shl || shr) {
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   return delta;
}
   9330 
   9331 
   9332 /* Get the current SSE rounding mode. */
   9333 
   9334 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9335 {
   9336    return
   9337       unop( Iop_64to32,
   9338             binop( Iop_And64,
   9339                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9340                    mkU64(3) ));
   9341 }
   9342 
   9343 static void put_sse_roundingmode ( IRExpr* sseround )
   9344 {
   9345    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   9346    stmt( IRStmt_Put( OFFB_SSEROUND,
   9347                      unop(Iop_32Uto64,sseround) ) );
   9348 }
   9349 
   9350 /* Break a V128-bit value up into four 32-bit ints. */
   9351 
   9352 static void breakupV128to32s ( IRTemp t128,
   9353                                /*OUTs*/
   9354                                IRTemp* t3, IRTemp* t2,
   9355                                IRTemp* t1, IRTemp* t0 )
   9356 {
   9357    IRTemp hi64 = newTemp(Ity_I64);
   9358    IRTemp lo64 = newTemp(Ity_I64);
   9359    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9360    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9361 
   9362    vassert(t0 && *t0 == IRTemp_INVALID);
   9363    vassert(t1 && *t1 == IRTemp_INVALID);
   9364    vassert(t2 && *t2 == IRTemp_INVALID);
   9365    vassert(t3 && *t3 == IRTemp_INVALID);
   9366 
   9367    *t0 = newTemp(Ity_I32);
   9368    *t1 = newTemp(Ity_I32);
   9369    *t2 = newTemp(Ity_I32);
   9370    *t3 = newTemp(Ity_I32);
   9371    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9372    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9373    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9374    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9375 }
   9376 
   9377 /* Construct a V128-bit value from four 32-bit ints. */
   9378 
   9379 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9380                                IRTemp t1, IRTemp t0 )
   9381 {
   9382    return
   9383       binop( Iop_64HLtoV128,
   9384              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9385              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9386    );
   9387 }
   9388 
   9389 /* Break a 64-bit value up into four 16-bit ints. */
   9390 
   9391 static void breakup64to16s ( IRTemp t64,
   9392                              /*OUTs*/
   9393                              IRTemp* t3, IRTemp* t2,
   9394                              IRTemp* t1, IRTemp* t0 )
   9395 {
   9396    IRTemp hi32 = newTemp(Ity_I32);
   9397    IRTemp lo32 = newTemp(Ity_I32);
   9398    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9399    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9400 
   9401    vassert(t0 && *t0 == IRTemp_INVALID);
   9402    vassert(t1 && *t1 == IRTemp_INVALID);
   9403    vassert(t2 && *t2 == IRTemp_INVALID);
   9404    vassert(t3 && *t3 == IRTemp_INVALID);
   9405 
   9406    *t0 = newTemp(Ity_I16);
   9407    *t1 = newTemp(Ity_I16);
   9408    *t2 = newTemp(Ity_I16);
   9409    *t3 = newTemp(Ity_I16);
   9410    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9411    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9412    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9413    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9414 }
   9415 
   9416 /* Construct a 64-bit value from four 16-bit ints. */
   9417 
   9418 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9419                              IRTemp t1, IRTemp t0 )
   9420 {
   9421    return
   9422       binop( Iop_32HLto64,
   9423              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9424              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9425    );
   9426 }
   9427 
   9428 /* Break a V256-bit value up into four 64-bit ints. */
   9429 
   9430 static void breakupV256to64s ( IRTemp t256,
   9431                                /*OUTs*/
   9432                                IRTemp* t3, IRTemp* t2,
   9433                                IRTemp* t1, IRTemp* t0 )
   9434 {
   9435    vassert(t0 && *t0 == IRTemp_INVALID);
   9436    vassert(t1 && *t1 == IRTemp_INVALID);
   9437    vassert(t2 && *t2 == IRTemp_INVALID);
   9438    vassert(t3 && *t3 == IRTemp_INVALID);
   9439    *t0 = newTemp(Ity_I64);
   9440    *t1 = newTemp(Ity_I64);
   9441    *t2 = newTemp(Ity_I64);
   9442    *t3 = newTemp(Ity_I64);
   9443    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9444    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9445    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9446    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9447 }
   9448 
   9449 /* Break a V256-bit value up into two V128s. */
   9450 
   9451 static void breakupV256toV128s ( IRTemp t256,
   9452                                  /*OUTs*/
   9453                                  IRTemp* t1, IRTemp* t0 )
   9454 {
   9455    vassert(t0 && *t0 == IRTemp_INVALID);
   9456    vassert(t1 && *t1 == IRTemp_INVALID);
   9457    *t0 = newTemp(Ity_V128);
   9458    *t1 = newTemp(Ity_V128);
   9459    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9460    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9461 }
   9462 
   9463 /* Break a V256-bit value up into eight 32-bit ints.  */
   9464 
   9465 static void breakupV256to32s ( IRTemp t256,
   9466                                /*OUTs*/
   9467                                IRTemp* t7, IRTemp* t6,
   9468                                IRTemp* t5, IRTemp* t4,
   9469                                IRTemp* t3, IRTemp* t2,
   9470                                IRTemp* t1, IRTemp* t0 )
   9471 {
   9472    IRTemp t128_1 = IRTemp_INVALID;
   9473    IRTemp t128_0 = IRTemp_INVALID;
   9474    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9475    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9476    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9477 }
   9478 
   9479 /* Break a V128-bit value up into two 64-bit ints. */
   9480 
   9481 static void breakupV128to64s ( IRTemp t128,
   9482                                /*OUTs*/
   9483                                IRTemp* t1, IRTemp* t0 )
   9484 {
   9485    vassert(t0 && *t0 == IRTemp_INVALID);
   9486    vassert(t1 && *t1 == IRTemp_INVALID);
   9487    *t0 = newTemp(Ity_I64);
   9488    *t1 = newTemp(Ity_I64);
   9489    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9490    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9491 }
   9492 
   9493 /* Construct a V256-bit value from eight 32-bit ints. */
   9494 
   9495 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9496                                IRTemp t5, IRTemp t4,
   9497                                IRTemp t3, IRTemp t2,
   9498                                IRTemp t1, IRTemp t0 )
   9499 {
   9500    return
   9501       binop( Iop_V128HLtoV256,
   9502              binop( Iop_64HLtoV128,
   9503                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9504                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9505              binop( Iop_64HLtoV128,
   9506                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9507                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9508    );
   9509 }
   9510 
   9511 /* Construct a V256-bit value from four 64-bit ints. */
   9512 
   9513 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9514                                IRTemp t1, IRTemp t0 )
   9515 {
   9516    return
   9517       binop( Iop_V128HLtoV256,
   9518              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9519              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9520    );
   9521 }
   9522 
   9523 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9524    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9525 
   9526    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9527 */
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   /* Compute, for each of the four 16-bit lanes of (aa, bb):
         (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
      by widening each lane to 32 bits, doing the work in two 32x2
      halves, and narrowing back to 16x4 at the end. */
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);
   IRTemp aalo32s = newTemp(Ity_I64);
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);
   assign(aa, aax);
   assign(bb, bbx);
   /* Sign-extend each 16-bit lane to 32 bits: interleaving a value
      with itself puts each lane in the top half of a 32-bit slot,
      and the arithmetic shift right by 16 sign-extends it. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   /* Constant 1 in each 32-bit lane, for the rounding step. */
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* ((product >>u 14) + 1) >>u 1, on the two upper lanes... */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* ...and the same on the two lower lanes. */
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Each 32-bit result fits in 16 bits; take the even (low) 16-bit
      lanes to narrow back to four 16-bit results. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   9593 
   9594 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9595    values (aa,bb), computes, for each lane:
   9596 
   9597           if aa_lane < 0 then - bb_lane
   9598      else if aa_lane > 0 then bb_lane
   9599      else 0
   9600 */
   9601 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9602 {
   9603    IRTemp aa       = newTemp(Ity_I64);
   9604    IRTemp bb       = newTemp(Ity_I64);
   9605    IRTemp zero     = newTemp(Ity_I64);
   9606    IRTemp bbNeg    = newTemp(Ity_I64);
   9607    IRTemp negMask  = newTemp(Ity_I64);
   9608    IRTemp posMask  = newTemp(Ity_I64);
   9609    IROp   opSub    = Iop_INVALID;
   9610    IROp   opCmpGTS = Iop_INVALID;
   9611 
   9612    switch (laneszB) {
   9613       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9614       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9615       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9616       default: vassert(0);
   9617    }
   9618 
   9619    assign( aa,      aax );
   9620    assign( bb,      bbx );
   9621    assign( zero,    mkU64(0) );
   9622    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9623    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9624    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9625 
   9626    return
   9627       binop(Iop_Or64,
   9628             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9629             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9630 
   9631 }
   9632 
   9633 
   9634 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9635    value aa, computes, for each lane
   9636 
   9637    if aa < 0 then -aa else aa
   9638 
   9639    Note that the result is interpreted as unsigned, so that the
   9640    absolute value of the most negative signed input can be
   9641    represented.
   9642 */
   9643 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9644 {
   9645    IRTemp res     = newTemp(Ity_I64);
   9646    IRTemp zero    = newTemp(Ity_I64);
   9647    IRTemp aaNeg   = newTemp(Ity_I64);
   9648    IRTemp negMask = newTemp(Ity_I64);
   9649    IRTemp posMask = newTemp(Ity_I64);
   9650    IROp   opSub   = Iop_INVALID;
   9651    IROp   opSarN  = Iop_INVALID;
   9652 
   9653    switch (laneszB) {
   9654       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9655       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9656       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9657       default: vassert(0);
   9658    }
   9659 
   9660    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9661    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9662    assign( zero,    mkU64(0) );
   9663    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9664    assign( res,
   9665            binop(Iop_Or64,
   9666                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9667                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9668    return res;
   9669 }
   9670 
   9671 /* XMM version of math_PABS_MMX. */
   9672 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9673 {
   9674    IRTemp res  = newTemp(Ity_V128);
   9675    IRTemp aaHi = newTemp(Ity_I64);
   9676    IRTemp aaLo = newTemp(Ity_I64);
   9677    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9678    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9679    assign(res, binop(Iop_64HLtoV128,
   9680                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9681                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9682    return res;
   9683 }
   9684 
   9685 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9686    partial applications in C :-( */
static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   /* math_PABS_XMM specialised to 4-byte (32-bit) lanes. */
   return math_PABS_XMM(aa, 4);
}
   9690 
static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   /* math_PABS_XMM specialised to 2-byte (16-bit) lanes. */
   return math_PABS_XMM(aa, 2);
}
   9694 
static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   /* math_PABS_XMM specialised to 1-byte (8-bit) lanes. */
   return math_PABS_XMM(aa, 1);
}
   9698 
   9699 /* YMM version of math_PABS_XMM. */
   9700 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9701 {
   9702    IRTemp res  = newTemp(Ity_V256);
   9703    IRTemp aaHi = IRTemp_INVALID;
   9704    IRTemp aaLo = IRTemp_INVALID;
   9705    breakupV256toV128s(aa, &aaHi, &aaLo);
   9706    assign(res, binop(Iop_V128HLtoV256,
   9707                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9708                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9709    return res;
   9710 }
   9711 
static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   /* math_PABS_YMM specialised to 4-byte (32-bit) lanes. */
   return math_PABS_YMM(aa, 4);
}
   9715 
static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   /* math_PABS_YMM specialised to 2-byte (16-bit) lanes. */
   return math_PABS_YMM(aa, 2);
}
   9719 
static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   /* math_PABS_YMM specialised to 1-byte (8-bit) lanes. */
   return math_PABS_YMM(aa, 1);
}
   9723 
   9724 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9725                                         IRTemp lo64, Long byteShift )
   9726 {
   9727    vassert(byteShift >= 1 && byteShift <= 7);
   9728    return
   9729       binop(Iop_Or64,
   9730             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9731             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9732       );
   9733 }
   9734 
static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   /* PALIGNR: concatenate dV (high) with sV (low) into a 256-bit
      intermediate, shift it right by imm8 bytes, and return the low
      128 bits.  Done case-by-case on imm8 over the four 64-bit
      quarters (dHi:dLo:sHi:sLo, most to least significant). */
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      /* No shift: result is sV unchanged. */
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      /* Partial shift within sV, pulling in low bytes of dLo. */
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      /* Exactly one 64-bit quarter. */
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      /* Exactly two quarters: result is dV. */
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      /* Only dHi (and zeroes) remain in the window. */
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      /* Shifted entirely out: all zeroes. */
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   9792 
   9793 
   9794 /* Generate a SIGSEGV followed by a restart of the current instruction
   9795    if effective_addr is not 16-aligned.  This is required behaviour
   9796    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   9797    This assumes that guest_RIP_curr_instr is set correctly! */
static
void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
{
   /* Emit a side-exit raising SIGSEGV, restarting at the current
      instruction, if (effective_addr & mask) != 0 -- i.e. if the
      address is not (mask+1)-byte aligned.  Requires that
      guest_RIP_curr_instr is already set correctly. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
               mkU64(0)),
         Ijk_SigSEGV,
         IRConst_U64(guest_RIP_curr_instr),
         OFFB_RIP
      )
   );
}
   9812 
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   /* SIGSEGV unless effective_addr is 16-byte aligned. */
   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
}
   9816 
static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   /* SIGSEGV unless effective_addr is 32-byte aligned. */
   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
}
   9820 
   9821 /* Helper for deciding whether a given insn (starting at the opcode
   9822    byte) may validly be used with a LOCK prefix.  The following insns
   9823    may be used with LOCK when their destination operand is in memory.
   9824    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   9825 
   9826    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   9827    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   9828    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   9830    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   9831    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   9832    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   9833 
   9834    DEC        FE /1,  FF /1
   9835    INC        FE /0,  FF /0
   9836 
   9837    NEG        F6 /3,  F7 /3
   9838    NOT        F6 /2,  F7 /2
   9839 
   9840    XCHG       86, 87
   9841 
   9842    BTC        0F BB,  0F BA /7
   9843    BTR        0F B3,  0F BA /6
   9844    BTS        0F AB,  0F BA /5
   9845 
   9846    CMPXCHG    0F B0,  0F B1
   9847    CMPXCHG8B  0F C7 /1
   9848 
   9849    XADD       0F C0,  0F C1
   9850 
   9851    ------------------------------
   9852 
   9853    80 /0  =  addb $imm8,  rm8
   9854    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   9855    82 /0  =  addb $imm8,  rm8
   9856    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   9857 
   9858    00     =  addb r8,  rm8
   9859    01     =  addl r32, rm32  and  addw r16, rm16
   9860 
   9861    Same for ADD OR ADC SBB AND SUB XOR
   9862 
   9863    FE /1  = dec rm8
   9864    FF /1  = dec rm32  and  dec rm16
   9865 
   9866    FE /0  = inc rm8
   9867    FF /0  = inc rm32  and  inc rm16
   9868 
   9869    F6 /3  = neg rm8
   9870    F7 /3  = neg rm32  and  neg rm16
   9871 
   9872    F6 /2  = not rm8
   9873    F7 /2  = not rm32  and  not rm16
   9874 
   9875    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   9877 
   9878    Same for BTS, BTR
   9879 */
   9880 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   9881 {
   9882    switch (opc[0]) {
   9883       case 0x00: case 0x01: case 0x08: case 0x09:
   9884       case 0x10: case 0x11: case 0x18: case 0x19:
   9885       case 0x20: case 0x21: case 0x28: case 0x29:
   9886       case 0x30: case 0x31:
   9887          if (!epartIsReg(opc[1]))
   9888             return True;
   9889          break;
   9890 
   9891       case 0x80: case 0x81: case 0x82: case 0x83:
   9892          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   9893              && !epartIsReg(opc[1]))
   9894             return True;
   9895          break;
   9896 
   9897       case 0xFE: case 0xFF:
   9898          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   9899              && !epartIsReg(opc[1]))
   9900             return True;
   9901          break;
   9902 
   9903       case 0xF6: case 0xF7:
   9904          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   9905              && !epartIsReg(opc[1]))
   9906             return True;
   9907          break;
   9908 
   9909       case 0x86: case 0x87:
   9910          if (!epartIsReg(opc[1]))
   9911             return True;
   9912          break;
   9913 
   9914       case 0x0F: {
   9915          switch (opc[1]) {
   9916             case 0xBB: case 0xB3: case 0xAB:
   9917                if (!epartIsReg(opc[2]))
   9918                   return True;
   9919                break;
   9920             case 0xBA:
   9921                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   9922                    && !epartIsReg(opc[2]))
   9923                   return True;
   9924                break;
   9925             case 0xB0: case 0xB1:
   9926                if (!epartIsReg(opc[2]))
   9927                   return True;
   9928                break;
   9929             case 0xC7:
   9930                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   9931                   return True;
   9932                break;
   9933             case 0xC0: case 0xC1:
   9934                if (!epartIsReg(opc[2]))
   9935                   return True;
   9936                break;
   9937             default:
   9938                break;
   9939          } /* switch (opc[1]) */
   9940          break;
   9941       }
   9942 
   9943       default:
   9944          break;
   9945    } /* switch (opc[0]) */
   9946 
   9947    return False;
   9948 }
   9949 
   9950 
   9951 /*------------------------------------------------------------*/
   9952 /*---                                                      ---*/
   9953 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   9954 /*---                                                      ---*/
   9955 /*------------------------------------------------------------*/
   9956 
static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   /* (V)COMISD / (V)UCOMISD: compare the low F64 lanes of G and E
      and set the guest condition codes from the comparison result.
      Returns the updated decode offset. */
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   /* The right operand comes from the low lane of an XMM register
      or from a 64-bit memory load. */
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Flags are set via the COPY thunk: DEP1 holds the final flag
      bits directly.  CmpF64's result is masked with 0x45, keeping
      only the C, P and Z flag positions. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
   9998 
   9999 
static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   /* (V)COMISS / (V)UCOMISS: compare the low F32 lanes of G and E
      and set the guest condition codes from the comparison result.
      Returns the updated decode offset. */
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   /* The right operand comes from the low lane of an XMM register
      or from a 32-bit memory load. */
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Flags via the COPY thunk, as in dis_COMISD: both F32 args are
      widened to F64 before the compare; the result is masked with
      0x45, keeping only the C, P and Z flag positions. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}
   10043 
   10044 
static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   /* (V)PSHUFD xmm: permute the four 32-bit lanes of E according to
      the immediate 'order' byte (2 bits per destination lane) and
      write the result to G.  When writesYmm is True, the result is
      written via putYMMRegLoAndZU (the VEX.128 form).  Returns the
      updated decode offset. */
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   const HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;     /* modrm byte + imm8 */
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;  /* amode + imm8 */
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* Each 2-bit field of 'order' picks a source lane for one
      destination lane, most significant field first. */
#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
   10089 
   10090 
/* Handle 256-bit VPSHUFD: the same 4-lane permutation as the 128-bit
   form, applied independently to each 128-bit half of the YMM source.
   Returns the updated instruction offset. */
static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    order;                      /* the imm8 lane selector */
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      /* imm8 immediately follows the modrm byte. */
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source: imm8 follows the address-mode bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   }

   /* s[0..3] are the low 128-bit half's lanes, s[4..7] the high
      half's; the same selector steers both halves below. */
   IRTemp s[8];
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
                         &s[3], &s[2], &s[1], &s[0] );

   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
                                 s[4 + ((order>>4)&3)],
                                 s[4 + ((order>>2)&3)],
                                 s[4 + ((order>>0)&3)],
                                 s[0 + ((order>>6)&3)],
                                 s[0 + ((order>>4)&3)],
                                 s[0 + ((order>>2)&3)],
                                 s[0 + ((order>>0)&3)] ) );
   return delta;
}
   10130 
   10131 
   10132 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10133 {
   10134    IRTemp dV    = newTemp(Ity_V128);
   10135    IRTemp hi64  = newTemp(Ity_I64);
   10136    IRTemp lo64  = newTemp(Ity_I64);
   10137    IRTemp hi64r = newTemp(Ity_I64);
   10138    IRTemp lo64r = newTemp(Ity_I64);
   10139 
   10140    vassert(imm >= 0 && imm <= 255);
   10141    if (imm >= 16) {
   10142       assign(dV, mkV128(0x0000));
   10143       return dV;
   10144    }
   10145 
   10146    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10147    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10148 
   10149    if (imm == 0) {
   10150       assign( lo64r, mkexpr(lo64) );
   10151       assign( hi64r, mkexpr(hi64) );
   10152    }
   10153    else
   10154    if (imm == 8) {
   10155       assign( hi64r, mkU64(0) );
   10156       assign( lo64r, mkexpr(hi64) );
   10157    }
   10158    else
   10159    if (imm > 8) {
   10160       assign( hi64r, mkU64(0) );
   10161       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10162    } else {
   10163       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10164       assign( lo64r,
   10165               binop( Iop_Or64,
   10166                      binop(Iop_Shr64, mkexpr(lo64),
   10167                            mkU8(8 * imm)),
   10168                      binop(Iop_Shl64, mkexpr(hi64),
   10169                            mkU8(8 * (8 - imm)) )
   10170                      )
   10171               );
   10172    }
   10173 
   10174    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10175    return dV;
   10176 }
   10177 
   10178 
   10179 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10180 {
   10181    IRTemp       dV    = newTemp(Ity_V128);
   10182    IRTemp       hi64  = newTemp(Ity_I64);
   10183    IRTemp       lo64  = newTemp(Ity_I64);
   10184    IRTemp       hi64r = newTemp(Ity_I64);
   10185    IRTemp       lo64r = newTemp(Ity_I64);
   10186 
   10187    vassert(imm >= 0 && imm <= 255);
   10188    if (imm >= 16) {
   10189       assign(dV, mkV128(0x0000));
   10190       return dV;
   10191    }
   10192 
   10193    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10194    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10195 
   10196    if (imm == 0) {
   10197       assign( lo64r, mkexpr(lo64) );
   10198       assign( hi64r, mkexpr(hi64) );
   10199    }
   10200    else
   10201    if (imm == 8) {
   10202       assign( lo64r, mkU64(0) );
   10203       assign( hi64r, mkexpr(lo64) );
   10204    }
   10205    else
   10206    if (imm > 8) {
   10207       assign( lo64r, mkU64(0) );
   10208       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10209    } else {
   10210       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10211       assign( hi64r,
   10212               binop( Iop_Or64,
   10213                      binop(Iop_Shl64, mkexpr(hi64),
   10214                            mkU8(8 * imm)),
   10215                      binop(Iop_Shr64, mkexpr(lo64),
   10216                            mkU8(8 * (8 - imm)) )
   10217                      )
   10218               );
   10219    }
   10220 
   10221    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10222    return dV;
   10223 }
   10224 
   10225 
/* Handle (V)CVTSD2SI and (V)CVTTSD2SI: convert the low F64 lane of
   the source (XMM register or 64-bit memory) to a 32- or 64-bit
   signed integer in the destination GPR.  opc 0x2C is the truncating
   form (round towards zero); opc 0x2D uses the current SSE rounding
   mode.  sz is the GPR width in bytes (4 or 8).  Returns the updated
   instruction offset. */
static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);   /* truncating variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating form always rounds towards zero; otherwise use the
      dynamic rounding mode from MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
   10272 
   10273 
/* Handle (V)CVTSS2SI and (V)CVTTSS2SI: convert the low F32 lane of
   the source (XMM register or 32-bit memory) to a 32- or 64-bit
   signed integer in the destination GPR.  The value is widened to
   F64 first (exact) and then converted.  opc 0x2C is the truncating
   form; opc 0x2D uses the current SSE rounding mode.  sz is the GPR
   width in bytes (4 or 8).  Returns the updated instruction offset. */
static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);   /* truncating variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating form always rounds towards zero; otherwise use the
      dynamic rounding mode from MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* F32 -> F64 is exact, so the only rounding happens in the final
      float-to-int conversion. */
   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}
   10324 
   10325 
/* Handle 128-bit (V)CVTPS2PD: widen the two low F32 lanes of the
   source (register or 64-bit memory) to F64 and write them to the
   destination's two F64 lanes.  The widening is exact, so no
   rounding mode is needed.  AVX form zeroes the upper YMM half.
   Returns the updated instruction offset. */
static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory form reads only 64 bits: two F32s at addr and addr+4. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}
   10359 
   10360 
/* Handle 256-bit VCVTPS2PD: widen four F32 lanes from the source
   (XMM register or 128-bit memory) to four F64 lanes in the YMM
   destination.  The widening is exact, so no rounding mode is
   needed.  Returns the updated instruction offset. */
static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory form reads 128 bits: four F32s at addr+0/4/8/12. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}
   10400 
   10401 
/* Handle 128-bit (V)CVTPD2PS: narrow the two F64 lanes of the source
   (register or 128-bit memory) to F32, using the current SSE rounding
   mode, writing them to the destination's two low F32 lanes and
   zeroing the two high lanes.  AVX form additionally zeroes the upper
   YMM half.  Returns the updated instruction offset. */
static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   /* Reinterpret the two 64-bit halves as F64 for conversion. */
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10445 
   10446 
/* Handle 128-bit (V)CVT(T)PS2DQ: convert four F32 lanes of the
   source (register or 128-bit memory) to signed 32-bit integers.
   r2zero selects the truncating variant (round towards zero);
   otherwise the current SSE rounding mode is used.  AVX form zeroes
   the upper YMM half.  Returns the updated instruction offset. */
static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Per lane: reinterpret as F32, widen (exactly) to F64, then do
      the single rounded F64 -> I32 conversion. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10495 
   10496 
/* Handle 256-bit VCVT(T)PS2DQ: convert eight F32 lanes of the source
   (register or 256-bit memory) to signed 32-bit integers.  r2zero
   selects the truncating variant; otherwise the current SSE rounding
   mode is used.  Returns the updated instruction offset. */
static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Per lane: reinterpret as F32, widen (exactly) to F64, then do
      the single rounded F64 -> I32 conversion. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10547 
   10548 
/* Handle 128-bit (V)CVT(T)PD2DQ: convert the two F64 lanes of the
   source (register or 128-bit memory) to signed 32-bit integers in
   the destination's two low lanes, zeroing the two high lanes.
   r2zero selects the truncating variant; otherwise the current SSE
   rounding mode is used.  AVX form zeroes the upper YMM half.
   Returns the updated instruction offset. */
static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      /* The trailing 'x' marks the 128-bit memory form (AT&T-style
         operand-size suffix, since memory doesn't imply a width). */
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Reinterpret the two 64-bit halves as F64 for conversion. */
   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10602 
   10603 
/* Handle 256-bit VCVT(T)PD2DQ: convert the four F64 lanes of the
   YMM/256-bit-memory source to signed 32-bit integers in the
   destination XMM register; the upper YMM half is zeroed.  r2zero
   selects the truncating variant; otherwise the current SSE rounding
   mode is used.  Returns the updated instruction offset. */
static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      /* The trailing 'y' marks the 256-bit memory form (AT&T-style
         operand-size suffix, since memory doesn't imply a width). */
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

   /* Per lane: reinterpret the 64-bit value as F64, then convert
      with the chosen rounding mode. */
#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10656 
   10657 
/* Handle 128-bit (V)CVTDQ2PS: convert four signed 32-bit integers in
   the source (register or 128-bit memory) to F32 lanes, rounding per
   the current SSE rounding mode.  AVX form zeroes the upper YMM
   half.  Returns the updated instruction offset. */
static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

   /* Per lane: I32 -> F64 is exact; the only rounding is in the
      final F64 -> F32 narrowing. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10705 
/* Handle 256-bit VCVTDQ2PS: convert eight signed 32-bit integers in
   the source (register or 256-bit memory) to F32 lanes, rounding per
   the current SSE rounding mode.  Returns the updated instruction
   offset. */
static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

   /* Per lane: I32 -> F64 is exact; the only rounding is in the
      final F64 -> F32 narrowing. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10757 
   10758 
   10759 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10760                                Long delta, Bool isAvx )
   10761 {
   10762    UChar modrm = getUChar(delta);
   10763    vassert(epartIsReg(modrm)); /* ensured by caller */
   10764    UInt   rE = eregOfRexRM(pfx,modrm);
   10765    UInt   rG = gregOfRexRM(pfx,modrm);
   10766    IRTemp t0 = newTemp(Ity_V128);
   10767    IRTemp t1 = newTemp(Ity_I32);
   10768    assign(t0, getXMMReg(rE));
   10769    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10770    putIReg32(rG, mkexpr(t1));
   10771    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10772        nameIReg32(rG));
   10773    delta += 1;
   10774    return delta;
   10775 }
   10776 
   10777 
   10778 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10779                                Long delta  )
   10780 {
   10781    UChar modrm = getUChar(delta);
   10782    vassert(epartIsReg(modrm)); /* ensured by caller */
   10783    UInt   rE = eregOfRexRM(pfx,modrm);
   10784    UInt   rG = gregOfRexRM(pfx,modrm);
   10785    IRTemp t0 = newTemp(Ity_V128);
   10786    IRTemp t1 = newTemp(Ity_V128);
   10787    IRTemp t2 = newTemp(Ity_I16);
   10788    IRTemp t3 = newTemp(Ity_I16);
   10789    assign(t0, getYMMRegLane128(rE, 0));
   10790    assign(t1, getYMMRegLane128(rE, 1));
   10791    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   10792    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   10793    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   10794    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   10795    delta += 1;
   10796    return delta;
   10797 }
   10798 
   10799 
   10800 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   10801    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   10802 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   10803 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10804 {
   10805    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10806    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10807    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10808    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10809    IRTemp res = newTemp(Ity_V128);
   10810    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   10811                      : mkV128from32s( s1, d1, s0, d0 ));
   10812    return res;
   10813 }
   10814 
   10815 
   10816 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   10817 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   10818 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10819 {
   10820    IRTemp s1 = newTemp(Ity_I64);
   10821    IRTemp s0 = newTemp(Ity_I64);
   10822    IRTemp d1 = newTemp(Ity_I64);
   10823    IRTemp d0 = newTemp(Ity_I64);
   10824    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10825    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10826    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10827    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10828    IRTemp res = newTemp(Ity_V128);
   10829    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   10830                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   10831    return res;
   10832 }
   10833 
   10834 
   10835 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   10836    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   10837    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   10838    way. */
   10839 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10840 {
   10841    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10842    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10843    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   10844    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   10845    IRTemp res = newTemp(Ity_V256);
   10846    assign(res, xIsH
   10847                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   10848                                             mkexpr(s1), mkexpr(d1))
   10849                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   10850                                             mkexpr(s0), mkexpr(d0)));
   10851    return res;
   10852 }
   10853 
   10854 
   10855 /* FIXME: this is really bad.  Surely can do something better here?
   10856    One observation is that the steering in the upper and lower 128 bit
   10857    halves is the same as with math_UNPCKxPS_128, so we simply split
   10858    into two halves, and use that.  Consequently any improvement in
   10859    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   10860    benefits this too. */
   10861 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10862 {
   10863    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10864    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10865    breakupV256toV128s( sV, &sVhi, &sVlo );
   10866    breakupV256toV128s( dV, &dVhi, &dVlo );
   10867    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   10868    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   10869    IRTemp rV   = newTemp(Ity_V256);
   10870    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10871    return rV;
   10872 }
   10873 
   10874 
   10875 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10876 {
   10877    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10878    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10879    vassert(imm8 < 256);
   10880 
   10881    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10882    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10883 
   10884 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   10885 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10886    IRTemp res = newTemp(Ity_V128);
   10887    assign(res,
   10888           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   10889                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   10890 #  undef SELD
   10891 #  undef SELS
   10892    return res;
   10893 }
   10894 
   10895 
   10896 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   10897    identically.  Hence do the clueless thing and use math_SHUFPS_128
   10898    twice. */
   10899 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10900 {
   10901    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10902    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10903    breakupV256toV128s( sV, &sVhi, &sVlo );
   10904    breakupV256toV128s( dV, &dVhi, &dVlo );
   10905    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   10906    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   10907    IRTemp rV   = newTemp(Ity_V256);
   10908    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10909    return rV;
   10910 }
   10911 
   10912 
   10913 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10914 {
   10915    IRTemp s1 = newTemp(Ity_I64);
   10916    IRTemp s0 = newTemp(Ity_I64);
   10917    IRTemp d1 = newTemp(Ity_I64);
   10918    IRTemp d0 = newTemp(Ity_I64);
   10919 
   10920    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10921    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10922    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10923    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10924 
   10925 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10926 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10927 
   10928    IRTemp res = newTemp(Ity_V128);
   10929    assign(res, binop( Iop_64HLtoV128,
   10930                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   10931 
   10932 #  undef SELD
   10933 #  undef SELS
   10934    return res;
   10935 }
   10936 
   10937 
   10938 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10939 {
   10940    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10941    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10942    breakupV256toV128s( sV, &sVhi, &sVlo );
   10943    breakupV256toV128s( dV, &dVhi, &dVlo );
   10944    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10945    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   10946    IRTemp rV   = newTemp(Ity_V256);
   10947    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10948    return rV;
   10949 }
   10950 
   10951 
   10952 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10953 {
   10954    UShort imm8_mask_16;
   10955    IRTemp imm8_mask = newTemp(Ity_V128);
   10956 
   10957    switch( imm8 & 3 ) {
   10958       case 0:  imm8_mask_16 = 0x0000; break;
   10959       case 1:  imm8_mask_16 = 0x00FF; break;
   10960       case 2:  imm8_mask_16 = 0xFF00; break;
   10961       case 3:  imm8_mask_16 = 0xFFFF; break;
   10962       default: vassert(0);            break;
   10963    }
   10964    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   10965 
   10966    IRTemp res = newTemp(Ity_V128);
   10967    assign ( res, binop( Iop_OrV128,
   10968                         binop( Iop_AndV128, mkexpr(sV),
   10969                                             mkexpr(imm8_mask) ),
   10970                         binop( Iop_AndV128, mkexpr(dV),
   10971                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   10972    return res;
   10973 }
   10974 
   10975 
   10976 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10977 {
   10978    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10979    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10980    breakupV256toV128s( sV, &sVhi, &sVlo );
   10981    breakupV256toV128s( dV, &dVhi, &dVlo );
   10982    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10983    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   10984    IRTemp rV   = newTemp(Ity_V256);
   10985    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10986    return rV;
   10987 }
   10988 
   10989 
   10990 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10991 {
   10992    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   10993                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   10994                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   10995                              0xFFFF };
   10996    IRTemp imm8_mask = newTemp(Ity_V128);
   10997    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   10998 
   10999    IRTemp res = newTemp(Ity_V128);
   11000    assign ( res, binop( Iop_OrV128,
   11001                         binop( Iop_AndV128, mkexpr(sV),
   11002                                             mkexpr(imm8_mask) ),
   11003                         binop( Iop_AndV128, mkexpr(dV),
   11004                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11005    return res;
   11006 }
   11007 
   11008 
   11009 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11010 {
   11011    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11012    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11013    breakupV256toV128s( sV, &sVhi, &sVlo );
   11014    breakupV256toV128s( dV, &dVhi, &dVlo );
   11015    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11016    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11017    IRTemp rV   = newTemp(Ity_V256);
   11018    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11019    return rV;
   11020 }
   11021 
   11022 
   11023 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11024 {
   11025    /* Make w be a 16-bit version of imm8, formed by duplicating each
   11026       bit in imm8. */
   11027    Int i;
   11028    UShort imm16 = 0;
   11029    for (i = 0; i < 8; i++) {
   11030       if (imm8 & (1 << i))
   11031          imm16 |= (3 << (2*i));
   11032    }
   11033    IRTemp imm16_mask = newTemp(Ity_V128);
   11034    assign( imm16_mask, mkV128( imm16 ));
   11035 
   11036    IRTemp res = newTemp(Ity_V128);
   11037    assign ( res, binop( Iop_OrV128,
   11038                         binop( Iop_AndV128, mkexpr(sV),
   11039                                             mkexpr(imm16_mask) ),
   11040                         binop( Iop_AndV128, mkexpr(dV),
   11041                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11042    return res;
   11043 }
   11044 
   11045 
   11046 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   11047 {
   11048    /* This is a really poor translation -- could be improved if
   11049       performance critical */
   11050    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11051    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11052    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11053    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11054    IRTemp res = newTemp(Ity_V128);
   11055    assign(res, binop(Iop_64HLtoV128,
   11056                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   11057                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   11058    return res;
   11059 }
   11060 
   11061 
   11062 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11063 {
   11064    /* This is a really poor translation -- could be improved if
   11065       performance critical */
   11066    IRTemp sHi, sLo, dHi, dLo;
   11067    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11068    breakupV256toV128s( dV, &dHi, &dLo);
   11069    breakupV256toV128s( sV, &sHi, &sLo);
   11070    IRTemp res = newTemp(Ity_V256);
   11071    assign(res, binop(Iop_V128HLtoV256,
   11072                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11073                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11074    return res;
   11075 }
   11076 
   11077 
   11078 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   11079 {
   11080    /* This is a really poor translation -- could be improved if
   11081       performance critical */
   11082    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11083    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11084    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11085    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11086    IRTemp res = newTemp(Ity_V128);
   11087    assign(res, binop(Iop_64HLtoV128,
   11088                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   11089                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   11090    return res;
   11091 }
   11092 
   11093 
   11094 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11095 {
   11096    /* This is a really poor translation -- could be improved if
   11097       performance critical */
   11098    IRTemp sHi, sLo, dHi, dLo;
   11099    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11100    breakupV256toV128s( dV, &dHi, &dLo);
   11101    breakupV256toV128s( sV, &sHi, &sLo);
   11102    IRTemp res = newTemp(Ity_V256);
   11103    assign(res, binop(Iop_V128HLtoV256,
   11104                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11105                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11106    return res;
   11107 }
   11108 
   11109 
   11110 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
   11111 {
   11112    IRTemp sVhi, sVlo, dVhi, dVlo;
   11113    IRTemp resHi = newTemp(Ity_I64);
   11114    IRTemp resLo = newTemp(Ity_I64);
   11115    sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   11116    breakupV128to64s( sV, &sVhi, &sVlo );
   11117    breakupV128to64s( dV, &dVhi, &dVlo );
   11118    assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11119                                 "amd64g_calculate_mmx_pmaddwd",
   11120                                 &amd64g_calculate_mmx_pmaddwd,
   11121                                 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   11122    assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11123                                 "amd64g_calculate_mmx_pmaddwd",
   11124                                 &amd64g_calculate_mmx_pmaddwd,
   11125                                 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   11126    IRTemp res = newTemp(Ity_V128);
   11127    assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
   11128    return res;
   11129 }
   11130 
   11131 
   11132 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11133 {
   11134    IRTemp sHi, sLo, dHi, dLo;
   11135    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11136    breakupV256toV128s( dV, &dHi, &dLo);
   11137    breakupV256toV128s( sV, &sHi, &sLo);
   11138    IRTemp res = newTemp(Ity_V256);
   11139    assign(res, binop(Iop_V128HLtoV256,
   11140                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11141                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11142    return res;
   11143 }
   11144 
   11145 
   11146 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   11147 {
   11148    IRTemp addV = newTemp(Ity_V128);
   11149    IRTemp subV = newTemp(Ity_V128);
   11150    IRTemp a1   = newTemp(Ity_I64);
   11151    IRTemp s0   = newTemp(Ity_I64);
   11152    IRTemp rm   = newTemp(Ity_I32);
   11153 
   11154    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11155    assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11156    assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11157 
   11158    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11159    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11160 
   11161    IRTemp res = newTemp(Ity_V128);
   11162    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11163    return res;
   11164 }
   11165 
   11166 
   11167 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   11168 {
   11169    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11170    IRTemp addV = newTemp(Ity_V256);
   11171    IRTemp subV = newTemp(Ity_V256);
   11172    IRTemp rm   = newTemp(Ity_I32);
   11173    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11174 
   11175    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11176    assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11177    assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11178 
   11179    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   11180    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   11181 
   11182    IRTemp res = newTemp(Ity_V256);
   11183    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   11184    return res;
   11185 }
   11186 
   11187 
   11188 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   11189 {
   11190    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11191    IRTemp addV = newTemp(Ity_V128);
   11192    IRTemp subV = newTemp(Ity_V128);
   11193    IRTemp rm   = newTemp(Ity_I32);
   11194    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11195 
   11196    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11197    assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11198    assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11199 
   11200    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   11201    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   11202 
   11203    IRTemp res = newTemp(Ity_V128);
   11204    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   11205    return res;
   11206 }
   11207 
   11208 
   11209 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   11210 {
   11211    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   11212    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   11213    IRTemp addV = newTemp(Ity_V256);
   11214    IRTemp subV = newTemp(Ity_V256);
   11215    IRTemp rm   = newTemp(Ity_I32);
   11216    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   11217    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11218 
   11219    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11220    assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11221    assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11222 
   11223    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   11224    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   11225 
   11226    IRTemp res = newTemp(Ity_V256);
   11227    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   11228    return res;
   11229 }
   11230 
   11231 
   11232 /* Handle 128 bit PSHUFLW and PSHUFHW. */
static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx, Bool xIsH )
{
   /* Handles 128-bit PSHUFLW (xIsH == False) and PSHUFHW (xIsH ==
      True) and their AVX-encoded forms: permute the four 16-bit
      lanes of one 64-bit half of the source per imm8, copying the
      other half through unchanged.  Returns the updated |delta|. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   sV    = newTemp(Ity_V128);
   dV    = newTemp(Ity_V128);
   sVmut = newTemp(Ity_I64);
   dVmut = newTemp(Ity_I64);
   sVcon = newTemp(Ity_I64);
   /* Fetch the source operand (register or memory) plus the
      trailing imm8 byte, and advance |delta| past both. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameXMMReg(rG));
   }

   /* Get the to-be-changed (mut) and unchanging (con) bits of the
      source. */
   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );

   /* Permute the four 16-bit lanes of the mutable half: each 2-bit
      field of imm8 names the source lane for one result lane. */
   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
#  define SEL(n) \
             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
#  undef SEL

   /* Reassemble, putting the shuffled half back where it came from. */
   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );

   /* The AVX form zeroes the upper YMM lane; the SSE form writes
      only the XMM register. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   return delta;
}
   11285 
   11286 
   11287 /* Handle 256 bit PSHUFLW and PSHUFHW. */
static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool xIsH )
{
   /* Handles 256-bit VPSHUFLW (xIsH == False) and VPSHUFHW (xIsH ==
      True): within each 128-bit half, permute the four 16-bit lanes
      of one 64-bit quarter per imm8 and pass the other quarter
      through unchanged.  Returns the updated |delta|. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   sV    = newTemp(Ity_V256);
   dVhi  = newTemp(Ity_I64);
   dVlo  = newTemp(Ity_I64);
   /* Fetch the source operand (register or memory) plus the
      trailing imm8 byte, and advance |delta| past both. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameYMMReg(rG));
   }

   /* Break out the two mutable 64-bit quarters (the high quarter of
      each 128-bit half for 'hw', the low quarter for 'lw') into
      16-bit lanes: s[7..4] for the upper half, s[3..0] for the
      lower. */
   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );

   /* Both halves are steered by the same imm8 fields. */
   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   /* Reassemble: shuffled quarters go back where they came from, the
      other two quarters are copied through. */
   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
                                 xIsH ? sV64[2] : dVhi,
                                 xIsH ? dVlo : sV64[1],
                                 xIsH ? sV64[0] : dVlo ) );
   return delta;
}
   11333 
   11334 
static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
                                          Long delta, Bool isAvx )
{
   /* (V)PEXTRW with register source only: extract the 16-bit lane of
      an XMM register selected by imm8[2:0] and zero-extend it into a
      32-bit GPR.  Returns the unchanged |delta| (== deltaIN) to
      signal decode failure when a memory operand is encoded. */
   Long   deltaIN = delta;
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx,modrm);
   IRTemp sV      = newTemp(Ity_V128);
   IRTemp d16     = newTemp(Ity_I16);
   UInt   imm8;
   IRTemp s0, s1, s2, s3;
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign(sV, getXMMReg(rE));
      imm8 = getUChar(delta+1) & 7;
      delta += 1+1;
      DIP("%spextrw $%d,%s,%s\n", isAvx ? "v" : "",
          (Int)imm8, nameXMMReg(rE), nameIReg32(rG));
   } else {
      /* The memory case is disallowed, apparently. */
      return deltaIN; /* FAIL */
   }
   /* Pick word |imm8| out of the four 32-bit chunks: word 2k is the
      low half of chunk k, word 2k+1 the high half. */
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   switch (imm8) {
      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
      default: vassert(0);
   }
   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   return delta;
}
   11372 
   11373 
static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   /* (V)CVTDQ2PD, 128-bit form: convert the two low 32-bit signed
      integers of the source (low half of an XMM register, or 64 bits
      of memory) to two F64s in the destination XMM register.
      Returns the updated |delta|. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp arg64 = newTemp(Ity_I64);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   const HChar* mbV   = isAvx ? "v" : "";
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( arg64, getXMMRegLane64(rE, 0) );
      delta += 1;
      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }
   /* Int32 -> F64 is exact, so no rounding mode is needed. */
   putXMMRegLane64F(
      rG, 0,
      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   );
   putXMMRegLane64F(
      rG, 1,
      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   );
   /* The AVX form zeroes the upper YMM lane. */
   if (isAvx)
      putYMMRegLane128(rG, 1, mkV128(0));
   return delta;
}
   11407 
   11408 
static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   /* (V)STMXCSR: store a synthesised %mxcsr value to the memory
      operand.  Returns the updated |delta|. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /* Fake up a native SSE mxcsr word.  The only thing it depends on
      is SSEROUND[1:0], so call a clean helper to cook it up.
   */
   /* ULong amd64h_create_mxcsr ( ULong sseround ) */
   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   storeLE(
      mkexpr(addr),
      unop(Iop_64to32,
           mkIRExprCCall(
              Ity_I64, 0/*regp*/,
              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
           )
      )
   );
   return delta;
}
   11439 
   11440 
static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   /* (V)LDMXCSR: load %mxcsr from the memory operand.  Only the
      rounding-mode bits are modelled; unsupported settings raise an
      emulation warning via a side exit.  Returns the updated
      |delta|. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */

   IRTemp t64 = newTemp(Ity_I64);
   IRTemp ew  = newTemp(Ity_I32);

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);

   /* The only thing we observe in %mxcsr is the rounding mode.
      Therefore, pass the 32-bit value (SSE native-format control
      word) to a clean helper, getting back a 64-bit value, the
      lower half of which is the SSEROUND value to store, and the
      upper half of which is the emulation-warning token which may
      be generated.
   */
   /* ULong amd64h_check_ldmxcsr ( ULong ); */
   assign( t64, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_check_ldmxcsr",
                   &amd64g_check_ldmxcsr,
                   mkIRExprVec_1(
                      unop(Iop_32Uto64,
                           loadLE(Ity_I32, mkexpr(addr))
                      )
                   )
                )
         );

   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   put_emwarn( mkexpr(ew) );
   /* Finally, if an emulation warning was reported, side-exit to
      the next insn, reporting the warning, so that Valgrind's
      dispatcher sees the warning. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
         Ijk_EmWarn,
         IRConst_U64(guest_RIP_bbstart+delta),
         OFFB_RIP
      )
   );
   return delta;
}
   11494 
   11495 
   11496 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   11497 {
   11498    vassert(imm8 >= 0 && imm8 <= 7);
   11499 
   11500    // Create a V128 value which has the selected word in the
   11501    // specified lane, and zeroes everywhere else.
   11502    IRTemp tmp128    = newTemp(Ity_V128);
   11503    IRTemp halfshift = newTemp(Ity_I64);
   11504    assign(halfshift, binop(Iop_Shl64,
   11505                            unop(Iop_16Uto64, mkexpr(u16)),
   11506                            mkU8(16 * (imm8 & 3))));
   11507    if (imm8 < 4) {
   11508       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   11509    } else {
   11510       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   11511    }
   11512 
   11513    UShort mask = ~(3 << (imm8 * 2));
   11514    IRTemp res  = newTemp(Ity_V128);
   11515    assign( res, binop(Iop_OrV128,
   11516                       mkexpr(tmp128),
   11517                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   11518    return res;
   11519 }
   11520 
   11521 
   11522 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   11523 {
   11524    IRTemp s1, s0, d1, d0;
   11525    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   11526 
   11527    breakupV128to64s( sV, &s1, &s0 );
   11528    breakupV128to64s( dV, &d1, &d0 );
   11529 
   11530    IRTemp res = newTemp(Ity_V128);
   11531    assign( res,
   11532            binop(Iop_64HLtoV128,
   11533                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11534                                "amd64g_calculate_mmx_psadbw",
   11535                                &amd64g_calculate_mmx_psadbw,
   11536                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   11537                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11538                                "amd64g_calculate_mmx_psadbw",
   11539                                &amd64g_calculate_mmx_psadbw,
   11540                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   11541    return res;
   11542 }
   11543 
   11544 
   11545 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   11546 {
   11547    IRTemp sHi, sLo, dHi, dLo;
   11548    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11549    breakupV256toV128s( dV, &dHi, &dLo);
   11550    breakupV256toV128s( sV, &sHi, &sLo);
   11551    IRTemp res = newTemp(Ity_V256);
   11552    assign(res, binop(Iop_V128HLtoV256,
   11553                      mkexpr(math_PSADBW_128(dHi, sHi)),
   11554                      mkexpr(math_PSADBW_128(dLo, sLo))));
   11555    return res;
   11556 }
   11557 
   11558 
   11559 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
   11560                              Long delta, Bool isAvx )
   11561 {
   11562    IRTemp regD    = newTemp(Ity_V128);
   11563    IRTemp mask    = newTemp(Ity_V128);
   11564    IRTemp olddata = newTemp(Ity_V128);
   11565    IRTemp newdata = newTemp(Ity_V128);
   11566    IRTemp addr    = newTemp(Ity_I64);
   11567    UChar  modrm   = getUChar(delta);
   11568    UInt   rG      = gregOfRexRM(pfx,modrm);
   11569    UInt   rE      = eregOfRexRM(pfx,modrm);
   11570 
   11571    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   11572    assign( regD, getXMMReg( rG ));
   11573 
   11574    /* Unfortunately can't do the obvious thing with SarN8x16
   11575       here since that can't be re-emitted as SSE2 code - no such
   11576       insn. */
   11577    assign( mask,
   11578            binop(Iop_64HLtoV128,
   11579                  binop(Iop_SarN8x8,
   11580                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   11581                        mkU8(7) ),
   11582                  binop(Iop_SarN8x8,
   11583                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   11584                        mkU8(7) ) ));
   11585    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   11586    assign( newdata, binop(Iop_OrV128,
   11587                           binop(Iop_AndV128,
   11588                                 mkexpr(regD),
   11589                                 mkexpr(mask) ),
   11590                           binop(Iop_AndV128,
   11591                                 mkexpr(olddata),
   11592                                 unop(Iop_NotV128, mkexpr(mask)))) );
   11593    storeLE( mkexpr(addr), mkexpr(newdata) );
   11594 
   11595    delta += 1;
   11596    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   11597        nameXMMReg(rE), nameXMMReg(rG) );
   11598    return delta;
   11599 }
   11600 
   11601 
   11602 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11603                                Long delta, Bool isAvx )
   11604 {
   11605    UChar modrm = getUChar(delta);
   11606    UInt   rG   = gregOfRexRM(pfx,modrm);
   11607    UInt   rE   = eregOfRexRM(pfx,modrm);
   11608    IRTemp t0   = newTemp(Ity_I32);
   11609    IRTemp t1   = newTemp(Ity_I32);
   11610    IRTemp t2   = newTemp(Ity_I32);
   11611    IRTemp t3   = newTemp(Ity_I32);
   11612    delta += 1;
   11613    assign( t0, binop( Iop_And32,
   11614                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   11615                       mkU32(1) ));
   11616    assign( t1, binop( Iop_And32,
   11617                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   11618                       mkU32(2) ));
   11619    assign( t2, binop( Iop_And32,
   11620                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   11621                       mkU32(4) ));
   11622    assign( t3, binop( Iop_And32,
   11623                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   11624                       mkU32(8) ));
   11625    putIReg32( rG, binop(Iop_Or32,
   11626                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11627                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11628    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   11629        nameXMMReg(rE), nameIReg32(rG));
   11630    return delta;
   11631 }
   11632 
   11633 
   11634 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   11635 {
   11636    UChar modrm = getUChar(delta);
   11637    UInt   rG   = gregOfRexRM(pfx,modrm);
   11638    UInt   rE   = eregOfRexRM(pfx,modrm);
   11639    IRTemp t0   = newTemp(Ity_I32);
   11640    IRTemp t1   = newTemp(Ity_I32);
   11641    IRTemp t2   = newTemp(Ity_I32);
   11642    IRTemp t3   = newTemp(Ity_I32);
   11643    IRTemp t4   = newTemp(Ity_I32);
   11644    IRTemp t5   = newTemp(Ity_I32);
   11645    IRTemp t6   = newTemp(Ity_I32);
   11646    IRTemp t7   = newTemp(Ity_I32);
   11647    delta += 1;
   11648    assign( t0, binop( Iop_And32,
   11649                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   11650                       mkU32(1) ));
   11651    assign( t1, binop( Iop_And32,
   11652                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   11653                       mkU32(2) ));
   11654    assign( t2, binop( Iop_And32,
   11655                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   11656                       mkU32(4) ));
   11657    assign( t3, binop( Iop_And32,
   11658                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   11659                       mkU32(8) ));
   11660    assign( t4, binop( Iop_And32,
   11661                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   11662                       mkU32(16) ));
   11663    assign( t5, binop( Iop_And32,
   11664                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   11665                       mkU32(32) ));
   11666    assign( t6, binop( Iop_And32,
   11667                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   11668                       mkU32(64) ));
   11669    assign( t7, binop( Iop_And32,
   11670                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   11671                       mkU32(128) ));
   11672    putIReg32( rG, binop(Iop_Or32,
   11673                         binop(Iop_Or32,
   11674                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11675                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   11676                         binop(Iop_Or32,
   11677                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   11678                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   11679    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11680    return delta;
   11681 }
   11682 
   11683 
   11684 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11685                                Long delta, Bool isAvx )
   11686 {
   11687    UChar modrm = getUChar(delta);
   11688    UInt   rG   = gregOfRexRM(pfx,modrm);
   11689    UInt   rE   = eregOfRexRM(pfx,modrm);
   11690    IRTemp t0   = newTemp(Ity_I32);
   11691    IRTemp t1   = newTemp(Ity_I32);
   11692    delta += 1;
   11693    assign( t0, binop( Iop_And32,
   11694                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   11695                       mkU32(1) ));
   11696    assign( t1, binop( Iop_And32,
   11697                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   11698                       mkU32(2) ));
   11699    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   11700    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   11701        nameXMMReg(rE), nameIReg32(rG));
   11702    return delta;
   11703 }
   11704 
   11705 
   11706 static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   11707 {
   11708    UChar modrm = getUChar(delta);
   11709    UInt   rG   = gregOfRexRM(pfx,modrm);
   11710    UInt   rE   = eregOfRexRM(pfx,modrm);
   11711    IRTemp t0   = newTemp(Ity_I32);
   11712    IRTemp t1   = newTemp(Ity_I32);
   11713    IRTemp t2   = newTemp(Ity_I32);
   11714    IRTemp t3   = newTemp(Ity_I32);
   11715    delta += 1;
   11716    assign( t0, binop( Iop_And32,
   11717                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   11718                       mkU32(1) ));
   11719    assign( t1, binop( Iop_And32,
   11720                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   11721                       mkU32(2) ));
   11722    assign( t2, binop( Iop_And32,
   11723                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   11724                       mkU32(4) ));
   11725    assign( t3, binop( Iop_And32,
   11726                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   11727                       mkU32(8) ));
   11728    putIReg32( rG, binop(Iop_Or32,
   11729                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11730                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11731    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11732    return delta;
   11733 }
   11734 
   11735 
   11736 /* Note, this also handles SSE(1) insns. */
   11737 __attribute__((noinline))
   11738 static
   11739 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   11740                         const VexAbiInfo* vbi,
   11741                         Prefix pfx, Int sz, Long deltaIN,
   11742                         DisResult* dres )
   11743 {
   11744    IRTemp addr  = IRTemp_INVALID;
   11745    IRTemp t0    = IRTemp_INVALID;
   11746    IRTemp t1    = IRTemp_INVALID;
   11747    IRTemp t2    = IRTemp_INVALID;
   11748    IRTemp t3    = IRTemp_INVALID;
   11749    IRTemp t4    = IRTemp_INVALID;
   11750    IRTemp t5    = IRTemp_INVALID;
   11751    IRTemp t6    = IRTemp_INVALID;
   11752    UChar  modrm = 0;
   11753    Int    alen  = 0;
   11754    HChar  dis_buf[50];
   11755 
   11756    *decode_OK = False;
   11757 
   11758    Long   delta = deltaIN;
   11759    UChar  opc   = getUChar(delta);
   11760    delta++;
   11761    switch (opc) {
   11762 
   11763    case 0x10:
   11764       if (have66noF2noF3(pfx)
   11765           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11766          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   11767          modrm = getUChar(delta);
   11768          if (epartIsReg(modrm)) {
   11769             putXMMReg( gregOfRexRM(pfx,modrm),
   11770                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11771             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11772                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11773             delta += 1;
   11774          } else {
   11775             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11776             putXMMReg( gregOfRexRM(pfx,modrm),
   11777                        loadLE(Ity_V128, mkexpr(addr)) );
   11778             DIP("movupd %s,%s\n", dis_buf,
   11779                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11780             delta += alen;
   11781          }
   11782          goto decode_success;
   11783       }
   11784       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   11785          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   11786          If E is reg, upper half of G is unchanged. */
   11787       if (haveF2no66noF3(pfx)
   11788           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   11789          modrm = getUChar(delta);
   11790          if (epartIsReg(modrm)) {
   11791             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11792                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11793             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11794                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11795             delta += 1;
   11796          } else {
   11797             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11798             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11799             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11800                              loadLE(Ity_I64, mkexpr(addr)) );
   11801             DIP("movsd %s,%s\n", dis_buf,
   11802                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11803             delta += alen;
   11804          }
   11805          goto decode_success;
   11806       }
   11807       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   11808          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   11809       if (haveF3no66noF2(pfx)
   11810           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11811          modrm = getUChar(delta);
   11812          if (epartIsReg(modrm)) {
   11813             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11814                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   11815             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11816                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11817             delta += 1;
   11818          } else {
   11819             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11820             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11821             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11822                              loadLE(Ity_I32, mkexpr(addr)) );
   11823             DIP("movss %s,%s\n", dis_buf,
   11824                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11825             delta += alen;
   11826          }
   11827          goto decode_success;
   11828       }
   11829       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   11830       if (haveNo66noF2noF3(pfx)
   11831           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11832          modrm = getUChar(delta);
   11833          if (epartIsReg(modrm)) {
   11834             putXMMReg( gregOfRexRM(pfx,modrm),
   11835                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11836             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11837                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11838             delta += 1;
   11839          } else {
   11840             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11841             putXMMReg( gregOfRexRM(pfx,modrm),
   11842                        loadLE(Ity_V128, mkexpr(addr)) );
   11843             DIP("movups %s,%s\n", dis_buf,
   11844                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   11845             delta += alen;
   11846          }
   11847          goto decode_success;
   11848       }
   11849       break;
   11850 
   11851    case 0x11:
   11852       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   11853          or lo half xmm). */
   11854       if (haveF2no66noF3(pfx)
   11855           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11856          modrm = getUChar(delta);
   11857          if (epartIsReg(modrm)) {
   11858             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   11859                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   11860             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11861                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   11862             delta += 1;
   11863          } else {
   11864             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11865             storeLE( mkexpr(addr),
   11866                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   11867             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11868                                  dis_buf);
   11869             delta += alen;
   11870          }
   11871          goto decode_success;
   11872       }
   11873       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   11874          or lo 1/4 xmm). */
   11875       if (haveF3no66noF2(pfx) && sz == 4) {
   11876          modrm = getUChar(delta);
   11877          if (epartIsReg(modrm)) {
   11878             /* fall through, we don't yet have a test case */
   11879          } else {
   11880             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11881             storeLE( mkexpr(addr),
   11882                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   11883             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11884                                  dis_buf);
   11885             delta += alen;
   11886             goto decode_success;
   11887          }
   11888       }
   11889       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   11890       if (have66noF2noF3(pfx)
   11891           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11892          modrm = getUChar(delta);
   11893          if (epartIsReg(modrm)) {
   11894             putXMMReg( eregOfRexRM(pfx,modrm),
   11895                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   11896             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11897                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   11898             delta += 1;
   11899          } else {
   11900             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11901             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11902             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11903                                   dis_buf );
   11904             delta += alen;
   11905          }
   11906          goto decode_success;
   11907       }
   11908       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   11909       if (haveNo66noF2noF3(pfx)
   11910           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11911          modrm = getUChar(delta);
   11912          if (epartIsReg(modrm)) {
   11913             /* fall through; awaiting test case */
   11914          } else {
   11915             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11916             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11917             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11918                                   dis_buf );
   11919             delta += alen;
   11920             goto decode_success;
   11921          }
   11922       }
   11923       break;
   11924 
   11925    case 0x12:
   11926       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   11927       /* Identical to MOVLPS ? */
   11928       if (have66noF2noF3(pfx)
   11929           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11930          modrm = getUChar(delta);
   11931          if (epartIsReg(modrm)) {
   11932             /* fall through; apparently reg-reg is not possible */
   11933          } else {
   11934             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11935             delta += alen;
   11936             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11937                              0/*lower lane*/,
   11938                              loadLE(Ity_I64, mkexpr(addr)) );
   11939             DIP("movlpd %s, %s\n",
   11940                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11941             goto decode_success;
   11942          }
   11943       }
   11944       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   11945       /* OF 12 = MOVHLPS -- from from hi half to lo half of XMM. */
   11946       if (haveNo66noF2noF3(pfx)
   11947           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11948          modrm = getUChar(delta);
   11949          if (epartIsReg(modrm)) {
   11950             delta += 1;
   11951             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11952                              0/*lower lane*/,
   11953                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   11954             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11955                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   11956          } else {
   11957             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11958             delta += alen;
   11959             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   11960                              loadLE(Ity_I64, mkexpr(addr)) );
   11961             DIP("movlps %s, %s\n",
   11962                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11963          }
   11964          goto decode_success;
   11965       }
   11966       break;
   11967 
   11968    case 0x13:
   11969       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   11970       if (haveNo66noF2noF3(pfx)
   11971           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11972          modrm = getUChar(delta);
   11973          if (!epartIsReg(modrm)) {
   11974             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11975             delta += alen;
   11976             storeLE( mkexpr(addr),
   11977                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11978                                       0/*lower lane*/ ) );
   11979             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11980                                    dis_buf);
   11981             goto decode_success;
   11982          }
   11983          /* else fall through */
   11984       }
   11985       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   11986       /* Identical to MOVLPS ? */
   11987       if (have66noF2noF3(pfx)
   11988           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11989          modrm = getUChar(delta);
   11990          if (!epartIsReg(modrm)) {
   11991             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11992             delta += alen;
   11993             storeLE( mkexpr(addr),
   11994                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11995                                       0/*lower lane*/ ) );
   11996             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11997                                    dis_buf);
   11998             goto decode_success;
   11999          }
   12000          /* else fall through */
   12001       }
   12002       break;
   12003 
   12004    case 0x14:
   12005    case 0x15:
   12006       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   12007       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   12008       /* These just appear to be special cases of SHUFPS */
   12009       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12010          Bool   hi = toBool(opc == 0x15);
   12011          IRTemp sV = newTemp(Ity_V128);
   12012          IRTemp dV = newTemp(Ity_V128);
   12013          modrm = getUChar(delta);
   12014          UInt   rG = gregOfRexRM(pfx,modrm);
   12015          assign( dV, getXMMReg(rG) );
   12016          if (epartIsReg(modrm)) {
   12017             UInt rE = eregOfRexRM(pfx,modrm);
   12018             assign( sV, getXMMReg(rE) );
   12019             delta += 1;
   12020             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12021                 nameXMMReg(rE), nameXMMReg(rG));
   12022          } else {
   12023             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12024             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12025             delta += alen;
   12026             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12027                 dis_buf, nameXMMReg(rG));
   12028          }
   12029          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   12030          putXMMReg( rG, mkexpr(res) );
   12031          goto decode_success;
   12032       }
   12033       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   12034       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   12035       /* These just appear to be special cases of SHUFPS */
   12036       if (have66noF2noF3(pfx)
   12037           && sz == 2 /* could be 8 if rex also present */) {
   12038          Bool   hi = toBool(opc == 0x15);
   12039          IRTemp sV = newTemp(Ity_V128);
   12040          IRTemp dV = newTemp(Ity_V128);
   12041          modrm = getUChar(delta);
   12042          UInt   rG = gregOfRexRM(pfx,modrm);
   12043          assign( dV, getXMMReg(rG) );
   12044          if (epartIsReg(modrm)) {
   12045             UInt rE = eregOfRexRM(pfx,modrm);
   12046             assign( sV, getXMMReg(rE) );
   12047             delta += 1;
   12048             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12049                 nameXMMReg(rE), nameXMMReg(rG));
   12050          } else {
   12051             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12052             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12053             delta += alen;
   12054             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12055                 dis_buf, nameXMMReg(rG));
   12056          }
   12057          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   12058          putXMMReg( rG, mkexpr(res) );
   12059          goto decode_success;
   12060       }
   12061       break;
   12062 
   12063    case 0x16:
   12064       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   12065       /* These seems identical to MOVHPS.  This instruction encoding is
   12066          completely crazy. */
   12067       if (have66noF2noF3(pfx)
   12068           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12069          modrm = getUChar(delta);
   12070          if (epartIsReg(modrm)) {
   12071             /* fall through; apparently reg-reg is not possible */
   12072          } else {
   12073             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12074             delta += alen;
   12075             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12076                              loadLE(Ity_I64, mkexpr(addr)) );
   12077             DIP("movhpd %s,%s\n", dis_buf,
   12078                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12079             goto decode_success;
   12080          }
   12081       }
   12082       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   12083       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   12084       if (haveNo66noF2noF3(pfx)
   12085           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12086          modrm = getUChar(delta);
   12087          if (epartIsReg(modrm)) {
   12088             delta += 1;
   12089             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12090                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   12091             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12092                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12093          } else {
   12094             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12095             delta += alen;
   12096             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12097                              loadLE(Ity_I64, mkexpr(addr)) );
   12098             DIP("movhps %s,%s\n", dis_buf,
   12099                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12100          }
   12101          goto decode_success;
   12102       }
   12103       break;
   12104 
   12105    case 0x17:
   12106       /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   12107       if (haveNo66noF2noF3(pfx)
   12108           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12109          modrm = getUChar(delta);
   12110          if (!epartIsReg(modrm)) {
   12111             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12112             delta += alen;
   12113             storeLE( mkexpr(addr),
   12114                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12115                                       1/*upper lane*/ ) );
   12116             DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12117                                   dis_buf);
   12118             goto decode_success;
   12119          }
   12120          /* else fall through */
   12121       }
   12122       /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   12123       /* Again, this seems identical to MOVHPS. */
   12124       if (have66noF2noF3(pfx)
   12125           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12126          modrm = getUChar(delta);
   12127          if (!epartIsReg(modrm)) {
   12128             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12129             delta += alen;
   12130             storeLE( mkexpr(addr),
   12131                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12132                                       1/*upper lane*/ ) );
   12133             DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12134                                   dis_buf);
   12135             goto decode_success;
   12136          }
   12137          /* else fall through */
   12138       }
   12139       break;
   12140 
   12141    case 0x18:
   12142       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   12143       /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   12144       /* 0F 18 /2 = PREFETCH1 */
   12145       /* 0F 18 /3 = PREFETCH2 */
   12146       if (haveNo66noF2noF3(pfx)
   12147           && !epartIsReg(getUChar(delta))
   12148           && gregLO3ofRM(getUChar(delta)) >= 0
   12149           && gregLO3ofRM(getUChar(delta)) <= 3) {
   12150          const HChar* hintstr = "??";
   12151 
   12152          modrm = getUChar(delta);
   12153          vassert(!epartIsReg(modrm));
   12154 
   12155          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12156          delta += alen;
   12157 
   12158          switch (gregLO3ofRM(modrm)) {
   12159             case 0: hintstr = "nta"; break;
   12160             case 1: hintstr = "t0"; break;
   12161             case 2: hintstr = "t1"; break;
   12162             case 3: hintstr = "t2"; break;
   12163             default: vassert(0);
   12164          }
   12165 
   12166          DIP("prefetch%s %s\n", hintstr, dis_buf);
   12167          goto decode_success;
   12168       }
   12169       break;
   12170 
   case 0x28:
      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* xmm -> xmm copy; no alignment constraint applies. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPD requires its memory operand to be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movapd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* xmm -> xmm copy; no alignment constraint applies. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPS requires its memory operand to be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movaps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12215 
   case 0x29:
      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
      /* Store-direction counterpart of 0F 28. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* xmm -> xmm copy; no alignment constraint applies. */
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPS requires its memory operand to be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* xmm -> xmm copy; no alignment constraint applies. */
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPD requires its memory operand to be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12258 
   case 0x2A:
      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
         half xmm */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp arg64 = newTemp(Ity_I64);
         IRTemp rmode = newTemp(Ity_I32);

         modrm = getUChar(delta);
         do_MMX_preamble();
         if (epartIsReg(modrm)) {
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2ps %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         assign( rmode, get_sse_roundingmode() );

         /* Widen I32 -> F64 exactly, then narrow F64 -> F32 under the
            current SSE rounding mode; only the narrowing can round. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64to32, mkexpr(arg64)) )) );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 1,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64HIto32, mkexpr(arg64)) )) );

         goto decode_success;
      }
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32 -> F64 is exact; rounding happens in F64 -> F32. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64 -> F64 can itself round, hence rmode appears twice. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* Every I32 is exactly representable as F64 -- no rounding
               mode needed. */
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64 -> F64 can round, so the SSE rounding mode is used. */
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* I32 -> F64 widening is exact -- no rounding mode needed. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;
   12436 
   case 0x2B:
      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
      /* The non-temporal cache hint is ignored.  Only the memory form
         is decoded; a register E operand falls through and the insn is
         treated as undecodable. */
      if ( (haveNo66noF2noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVNTPS/MOVNTPD require a 16-aligned memory operand. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   12456 
   12457    case 0x2C:
   12458    case 0x2D:
   12459       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   12460          I32 in mmx, according to prevailing SSE rounding mode */
   12461       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   12462          I32 in mmx, rounding towards zero */
   12463       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12464          IRTemp dst64  = newTemp(Ity_I64);
   12465          IRTemp rmode  = newTemp(Ity_I32);
   12466          IRTemp f32lo  = newTemp(Ity_F32);
   12467          IRTemp f32hi  = newTemp(Ity_F32);
   12468          Bool   r2zero = toBool(opc == 0x2C);
   12469 
   12470          do_MMX_preamble();
   12471          modrm = getUChar(delta);
   12472 
   12473          if (epartIsReg(modrm)) {
   12474             delta += 1;
   12475             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   12476             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   12477             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   12478                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   12479                                       nameMMXReg(gregLO3ofRM(modrm)));
   12480          } else {
   12481             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12482             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   12483             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   12484                                                  mkexpr(addr),
   12485                                                  mkU64(4) )));
   12486             delta += alen;
   12487             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   12488                                       dis_buf,
   12489                                       nameMMXReg(gregLO3ofRM(modrm)));
   12490          }
   12491 
   12492          if (r2zero) {
   12493             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   12494          } else {
   12495             assign( rmode, get_sse_roundingmode() );
   12496          }
   12497 
   12498          assign(
   12499             dst64,
   12500             binop( Iop_32HLto64,
   12501                    binop( Iop_F64toI32S,
   12502                           mkexpr(rmode),
   12503                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   12504                    binop( Iop_F64toI32S,
   12505                           mkexpr(rmode),
   12506                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   12507                  )
   12508          );
   12509 
   12510          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   12511          goto decode_success;
   12512       }
   12513       /* F3 0F 2D = CVTSS2SI
   12514          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   12515                        according to prevailing SSE rounding mode
   12516          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   12517                        according to prevailing SSE rounding mode
   12518       */
   12519       /* F3 0F 2C = CVTTSS2SI
   12520          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   12521                        truncating towards zero
   12522          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   12523                        truncating towards zero
   12524       */
   12525       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   12526          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   12527          goto decode_success;
   12528       }
   12529       /* F2 0F 2D = CVTSD2SI
   12530          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   12531                        according to prevailing SSE rounding mode
   12532          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   12533                        according to prevailing SSE rounding mode
   12534       */
   12535       /* F2 0F 2C = CVTTSD2SI
   12536          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   12537                        truncating towards zero
   12538          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   12539                        truncating towards zero
   12540       */
   12541       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   12542          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   12543          goto decode_success;
   12544       }
   12545       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   12546          I32 in mmx, according to prevailing SSE rounding mode */
   12547       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   12548          I32 in mmx, rounding towards zero */
   12549       if (have66noF2noF3(pfx) && sz == 2) {
   12550          IRTemp dst64  = newTemp(Ity_I64);
   12551          IRTemp rmode  = newTemp(Ity_I32);
   12552          IRTemp f64lo  = newTemp(Ity_F64);
   12553          IRTemp f64hi  = newTemp(Ity_F64);
   12554          Bool   r2zero = toBool(opc == 0x2C);
   12555 
   12556          do_MMX_preamble();
   12557          modrm = getUChar(delta);
   12558 
   12559          if (epartIsReg(modrm)) {
   12560             delta += 1;
   12561             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   12562             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   12563             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   12564                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   12565                                       nameMMXReg(gregLO3ofRM(modrm)));
   12566          } else {
   12567             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12568             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   12569             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   12570                                                  mkexpr(addr),
   12571                                                  mkU64(8) )));
   12572             delta += alen;
   12573             DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   12574                                       dis_buf,
   12575                                       nameMMXReg(gregLO3ofRM(modrm)));
   12576          }
   12577 
   12578          if (r2zero) {
   12579             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   12580          } else {
   12581             assign( rmode, get_sse_roundingmode() );
   12582          }
   12583 
   12584          assign(
   12585             dst64,
   12586             binop( Iop_32HLto64,
   12587                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   12588                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   12589                  )
   12590          );
   12591 
   12592          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   12593          goto decode_success;
   12594       }
   12595       break;
   12596 
   12597    case 0x2E:
   12598    case 0x2F:
   12599       /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   12600       /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   12601       if (have66noF2noF3(pfx) && sz == 2) {
   12602          delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
   12603          goto decode_success;
   12604       }
   12605       /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   12606       /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   12607       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12608          delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
   12609          goto decode_success;
   12610       }
   12611       break;
   12612 
   case 0x50:
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertently produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      /* NOTE(review): unlike the MOVMSKPS arm above, this one does not
         guard on epartIsReg -- presumably dis_MOVMSKPD_128 handles (or
         rejects) a memory E operand; confirm against its definition. */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12651 
   12652    case 0x51:
   12653       /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
   12654       if (haveF3no66noF2(pfx) && sz == 4) {
   12655          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   12656                                             "sqrtss", Iop_Sqrt32F0x4 );
   12657          goto decode_success;
   12658       }
   12659       /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   12660       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12661          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   12662                                            "sqrtps", Iop_Sqrt32Fx4 );
   12663          goto decode_success;
   12664       }
   12665       /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   12666       if (haveF2no66noF3(pfx) && sz == 4) {
   12667          delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
   12668                                             "sqrtsd", Iop_Sqrt64F0x2 );
   12669          goto decode_success;
   12670       }
   12671       /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   12672       if (have66noF2noF3(pfx) && sz == 2) {
   12673          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   12674                                            "sqrtpd", Iop_Sqrt64Fx2 );
   12675          goto decode_success;
   12676       }
   12677       break;
   12678 
   12679    case 0x52:
   12680       /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   12681       if (haveF3no66noF2(pfx) && sz == 4) {
   12682          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   12683                                             "rsqrtss", Iop_RSqrtEst32F0x4 );
   12684          goto decode_success;
   12685       }
   12686       /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   12687       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12688          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   12689                                            "rsqrtps", Iop_RSqrtEst32Fx4 );
   12690          goto decode_success;
   12691       }
   12692       break;
   12693 
   12694    case 0x53:
   12695       /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   12696       if (haveF3no66noF2(pfx) && sz == 4) {
   12697          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   12698                                             "rcpss", Iop_RecipEst32F0x4 );
   12699          goto decode_success;
   12700       }
   12701       /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   12702       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12703          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   12704                                            "rcpps", Iop_RecipEst32Fx4 );
   12705          goto decode_success;
   12706       }
   12707       break;
   12708 
   12709    case 0x54:
   12710       /* 0F 54 = ANDPS -- G = G and E */
   12711       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12712          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
   12713          goto decode_success;
   12714       }
   12715       /* 66 0F 54 = ANDPD -- G = G and E */
   12716       if (have66noF2noF3(pfx) && sz == 2) {
   12717          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
   12718          goto decode_success;
   12719       }
   12720       break;
   12721 
   12722    case 0x55:
   12723       /* 0F 55 = ANDNPS -- G = (not G) and E */
   12724       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12725          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
   12726                                                            Iop_AndV128 );
   12727          goto decode_success;
   12728       }
   12729       /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   12730       if (have66noF2noF3(pfx) && sz == 2) {
   12731          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
   12732                                                            Iop_AndV128 );
   12733          goto decode_success;
   12734       }
   12735       break;
   12736 
   case 0x56:
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;
   12749 
   12750    case 0x57:
   12751       /* 66 0F 57 = XORPD -- G = G xor E */
   12752       if (have66noF2noF3(pfx) && sz == 2) {
   12753          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
   12754          goto decode_success;
   12755       }
   12756       /* 0F 57 = XORPS -- G = G xor E */
   12757       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12758          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
   12759          goto decode_success;
   12760       }
   12761       break;
   12762 
   case 0x58:
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      break;
   12787 
   12788    case 0x59:
   12789       /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   12790       if (haveF2no66noF3(pfx)
   12791           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12792          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
   12793          goto decode_success;
   12794       }
   12795       /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   12796       if (haveF3no66noF2(pfx) && sz == 4) {
   12797          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
   12798          goto decode_success;
   12799       }
   12800       /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   12801       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12802          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
   12803          goto decode_success;
   12804       }
   12805       /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   12806       if (have66noF2noF3(pfx)
   12807           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12808          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
   12809          goto decode_success;
   12810       }
   12811       break;
   12812 
   case 0x5A:
      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
         F64 in xmm(G). */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
         low half xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         IRTemp f32lo = newTemp(Ity_F32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            delta += alen;
            DIP("cvtss2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* F32 -> F64 widening is exact -- no rounding mode needed. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop( Iop_F32toF64, mkexpr(f32lo) ) );

         goto decode_success;
      }
      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
         low 1/4 xmm(G), according to prevailing SSE rounding mode */
      if (haveF2no66noF3(pfx) && sz == 4) {
         IRTemp rmode = newTemp(Ity_I32);
         IRTemp f64lo = newTemp(Ity_F64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            delta += alen;
            DIP("cvtsd2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* F64 -> F32 narrowing rounds under the current MXCSR mode. */
         assign( rmode, get_sse_roundingmode() );
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
         );

         goto decode_success;
      }
      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
         lo half xmm(G), rounding according to prevailing SSE rounding
         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would have
         been nice to merge them together. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12882 
   case 0x5B:
      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), rounding towards zero */
      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), as per the prevailing rounding mode */
      if ( (have66noF2noF3(pfx) && sz == 2)
           || (haveF3no66noF2(pfx) && sz == 4) ) {
         /* sz == 4 can only have come from the F3-prefixed alternative
            above, i.e. CVTTPS2DQ, the truncating variant.  Keying the
            choice off sz rather than off the prefix tests directly is
            what the FIXME below is worried about. */
         Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
         goto decode_success;
      }
      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
         xmm(G) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12901 
   case 0x5C:
      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
      /* NOTE(review): this arm tolerates a redundant REX.W (sz == 8),
         but the MINSD/DIVSD/MAXSD arms below insist on sz == 4.
         Presumably they should all tolerate a redundant REX.W the same
         way -- confirm and unify. */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5D:
      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5E:
      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5F:
      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      break;
   12994 
   case 0x60:
      /* 66 0F 60 = PUNPCKLBW */
      /* NOTE(review): for the dis_SSEint_E_to_G calls in cases
         0x60..0x6D, the final Bool appears to control operand ordering:
         it is True for the interleave/pack ops, where argument order
         matters, and False for the symmetric-looking compares in
         0x64..0x66 -- confirm against dis_SSEint_E_to_G. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklbw",
                                    Iop_InterleaveLO8x16, True );
         goto decode_success;
      }
      break;

   case 0x61:
      /* 66 0F 61 = PUNPCKLWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklwd",
                                    Iop_InterleaveLO16x8, True );
         goto decode_success;
      }
      break;

   case 0x62:
      /* 66 0F 62 = PUNPCKLDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckldq",
                                    Iop_InterleaveLO32x4, True );
         goto decode_success;
      }
      break;

   case 0x63:
      /* 66 0F 63 = PACKSSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packsswb",
                                    Iop_QNarrowBin16Sto8Sx16, True );
         goto decode_success;
      }
      break;

   case 0x64:
      /* 66 0F 64 = PCMPGTB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
         goto decode_success;
      }
      break;

   case 0x65:
      /* 66 0F 65 = PCMPGTW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
         goto decode_success;
      }
      break;

   case 0x66:
      /* 66 0F 66 = PCMPGTD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
         goto decode_success;
      }
      break;

   case 0x67:
      /* 66 0F 67 = PACKUSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packuswb",
                                    Iop_QNarrowBin16Sto8Ux16, True );
         goto decode_success;
      }
      break;

   case 0x68:
      /* 66 0F 68 = PUNPCKHBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhbw",
                                    Iop_InterleaveHI8x16, True );
         goto decode_success;
      }
      break;

   case 0x69:
      /* 66 0F 69 = PUNPCKHWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhwd",
                                    Iop_InterleaveHI16x8, True );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* 66 0F 6A = PUNPCKHDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhdq",
                                    Iop_InterleaveHI32x4, True );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* 66 0F 6B = PACKSSDW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packssdw",
                                    Iop_QNarrowBin32Sto16Sx8, True );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* 66 0F 6C = PUNPCKLQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklqdq",
                                    Iop_InterleaveLO64x2, True );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* 66 0F 6D = PUNPCKHQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhqdq",
                                    Iop_InterleaveHI64x2, True );
         goto decode_success;
      }
      break;
   13131 
   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm. */
      /*              or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
      if (have66noF2noF3(pfx)) {
         /* With the mandatory 66 prefix, sz arrives as 2 unless REX.W
            bumped it to 8; remap the 2 case to a 32-bit operand. */
         vassert(sz == 2 || sz == 8);
         if (sz == 2) sz = 4;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            if (sz == 4) {
               /* Iop_32UtoV128 zero-extends into the vector, which
                  supplies the required zeroing of the upper lanes. */
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            }
         } else {
            /* Memory-source form: load 32 or 64 bits and zero-extend. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               sz == 4
                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   13173 
   case 0x6F:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA faults on a non-16-byte-aligned address; model that
               with an explicit alignment check before the load. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqa %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      if (haveF3no66noF2(pfx) && sz == 4) {
         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
         /* Unaligned form: same as above but with no alignment check. */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqu %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   13216 
   case 0x70:
      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int order;
         IRTemp sV, dV, s3, s2, s1, s0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         sV = newTemp(Ity_I64);
         dV = newTemp(Ity_I64);
         /* MMX-class insn, so run the MMX preamble first (presumably
            switches the FP unit into MMX mode -- see do_MMX_preamble). */
         do_MMX_preamble();
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            /* The shuffle-order immediate byte follows the modrm byte. */
            order = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("pshufw $%d,%s,%s\n", order,
                                      nameMMXReg(eregLO3ofRM(modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*extra byte after amode*/ );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            /* Immediate byte sits after the address mode bytes. */
            order = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("pshufw $%d,%s,%s\n", order,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }
         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
         /* Each 2-bit field of the immediate selects one source lane. */
#        define SEL(n) \
                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
         assign(dV,
                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                             SEL((order>>2)&3), SEL((order>>0)&3) )
         );
         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#        undef SEL
         goto decode_success;
      }
      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
         mem) to G(xmm), and copy upper half */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
         mem) to G(xmm), and copy lower half */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   13276 
   case 0x71:
      /* 66 0F 71 /2 ib = PSRLW by immediate */
      /* The /digit (reg field of the modrm byte) selects which shift.
         Only register-destination forms are handled here. */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /4 ib = PSRAW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /6 ib = PSLLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;

   case 0x72:
      /* 66 0F 72 /2 ib = PSRLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /4 ib = PSRAD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /6 ib = PSLLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;
   13324 
   13325    case 0x73:
   13326       /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   13327       /* note, if mem case ever filled in, 1 byte after amode */
   13328       if (have66noF2noF3(pfx) && sz == 2
   13329           && epartIsReg(getUChar(delta))
   13330           && gregLO3ofRM(getUChar(delta)) == 3) {
   13331          Int imm = (Int)getUChar(delta+1);
   13332          Int reg = eregOfRexRM(pfx,getUChar(delta));
   13333          DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   13334          delta += 2;
   13335          IRTemp sV = newTemp(Ity_V128);
   13336          assign( sV, getXMMReg(reg) );
   13337          putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
   13338          goto decode_success;
   13339       }
   13340       /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   13341       /* note, if mem case ever filled in, 1 byte after amode */
   13342       if (have66noF2noF3(pfx) && sz == 2
   13343           && epartIsReg(getUChar(delta))
   13344           && gregLO3ofRM(getUChar(delta)) == 7) {
   13345          Int imm = (Int)getUChar(delta+1);
   13346          Int reg = eregOfRexRM(pfx,getUChar(delta));
   13347          DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   13348          vassert(imm >= 0 && imm <= 255);
   13349          delta += 2;
   13350          IRTemp sV = newTemp(Ity_V128);
   13351          assign( sV, getXMMReg(reg) );
   13352          putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
   13353          goto decode_success;
   13354       }
   13355       /* 66 0F 73 /2 ib = PSRLQ by immediate */
   13356       if (have66noF2noF3(pfx) && sz == 2
   13357           && epartIsReg(getUChar(delta))
   13358           && gregLO3ofRM(getUChar(delta)) == 2) {
   13359          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
   13360          goto decode_success;
   13361       }
   13362       /* 66 0F 73 /6 ib = PSLLQ by immediate */
   13363       if (have66noF2noF3(pfx) && sz == 2
   13364           && epartIsReg(getUChar(delta))
   13365           && gregLO3ofRM(getUChar(delta)) == 6) {
   13366          delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
   13367          goto decode_success;
   13368       }
   13369       break;
   13370 
   case 0x74:
      /* 66 0F 74 = PCMPEQB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqb", Iop_CmpEQ8x16, False );
         goto decode_success;
      }
      break;

   case 0x75:
      /* 66 0F 75 = PCMPEQW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqw", Iop_CmpEQ16x8, False );
         goto decode_success;
      }
      break;

   case 0x76:
      /* 66 0F 76 = PCMPEQD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqd", Iop_CmpEQ32x4, False );
         goto decode_success;
      }
      break;
   13397 
   13398    case 0x7E:
   13399       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   13400          G (lo half xmm).  Upper half of G is zeroed out. */
   13401       if (haveF3no66noF2(pfx)
   13402           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13403          modrm = getUChar(delta);
   13404          if (epartIsReg(modrm)) {
   13405             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13406                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   13407                /* zero bits 127:64 */
   13408                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   13409             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13410                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13411             delta += 1;
   13412          } else {
   13413             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13414             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   13415             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13416                              loadLE(Ity_I64, mkexpr(addr)) );
   13417             DIP("movsd %s,%s\n", dis_buf,
   13418                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13419             delta += alen;
   13420          }
   13421          goto decode_success;
   13422       }
   13423       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   13424       /*              or from xmm low 1/2 to ireg64 or m64. */
   13425          if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   13426          if (sz == 2) sz = 4;
   13427          modrm = getUChar(delta);
   13428          if (epartIsReg(modrm)) {
   13429             delta += 1;
   13430             if (sz == 4) {
   13431                putIReg32( eregOfRexRM(pfx,modrm),
   13432                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   13433                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13434                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   13435             } else {
   13436                putIReg64( eregOfRexRM(pfx,modrm),
   13437                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   13438                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13439                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   13440             }
   13441          } else {
   13442             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13443             delta += alen;
   13444             storeLE( mkexpr(addr),
   13445                      sz == 4
   13446                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   13447                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   13448             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   13449                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   13450          }
   13451          goto decode_success;
   13452       }
   13453       break;
   13454 
   case 0x7F:
      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Register-to-register form deliberately not yet handled;
               everything after this goto is dead code kept ready for
               when a test case turns up. */
            goto decode_failure; /* awaiting test case */
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA store faults on a non-16-byte-aligned address. */
            gen_SEGV_if_not_16_aligned( addr );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      break;
   13493 
   case 0xAE:
      /* 0F AE is a group opcode: the individual insn is selected by
         the /reg field of the modrm byte and by whether the E part is
         a register or memory. */
      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("sfence\n");
         goto decode_success;
      }
      /* mindless duplication follows .. */
      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
      /* All three fences are modelled with the same full Imbe_Fence
         barrier. */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta))
          && (gregLO3ofRM(getUChar(delta)) == 5
              || gregLO3ofRM(getUChar(delta)) == 6)
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         /* delta-1 re-reads the modrm byte consumed just above, so as
            to pick the right mnemonic. */
         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
         goto decode_success;
      }

      /* 0F AE /7 = CLFLUSH -- flush cache line */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {

         /* This is something of a hack.  We need to know the size of
            the cache line containing addr.  Since we don't (easily),
            assume 256 on the basis that no real cache would have a
            line that big.  It's safe to invalidate more stuff than we
            need, just inefficient. */
         ULong lineszB = 256ULL;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* Round addr down to the start of the containing block. */
         stmt( IRStmt_Put(
                  OFFB_CMSTART,
                  binop( Iop_And64,
                         mkexpr(addr),
                         mkU64( ~(lineszB-1) ))) );

         stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );

         /* Exit with Ijk_InvalICache: CMSTART/CMLEN, set just above,
            describe the guest address range whose cached translations
            are to be invalidated. */
         jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));

         DIP("clflush %s\n", dis_buf);
         goto decode_success;
      }

      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
         Note that the presence or absence of REX.W slightly affects the
         written format: whether the saved FPU IP and DP pointers are 64
         or 32 bits.  But the helper function we call simply writes zero
         bits in the relevant fields (which are 64 bits regardless of
         what REX.W is) and so it's good enough (iow, equally broken) in
         both cases. */
      /* NOTE(review): unlike the /2,/3,/5,/6,/7 tests above, this and
         the FXRSTOR case below test the reg field with gregOfRexRM,
         which folds in the REX.R bit; with REX.R set the comparison
         against 0 (resp. 1) fails and the insn does not decode here --
         confirm that is intended. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
          IRDirty* d;
         modrm = getUChar(delta);
         vassert(!epartIsReg(modrm));

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);

         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);

         /* Uses dirty helper:
              void amd64g_do_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
                                                     ULong ) */
         d = unsafeIRDirty_0_N (
                0/*regparms*/,
                "amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM",
                &amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM,
                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
             );

         /* declare we're writing memory */
         d->mFx   = Ifx_Write;
         d->mAddr = mkexpr(addr);
         d->mSize = 464; /* according to recent Intel docs */

         /* declare we're reading guest state */
         d->nFxState = 6;
         vex_bzero(&d->fxState, sizeof(d->fxState));

         d->fxState[0].fx     = Ifx_Read;
         d->fxState[0].offset = OFFB_FTOP;
         d->fxState[0].size   = sizeof(UInt);

         d->fxState[1].fx     = Ifx_Read;
         d->fxState[1].offset = OFFB_FPREGS;
         d->fxState[1].size   = 8 * sizeof(ULong);

         d->fxState[2].fx     = Ifx_Read;
         d->fxState[2].offset = OFFB_FPTAGS;
         d->fxState[2].size   = 8 * sizeof(UChar);

         d->fxState[3].fx     = Ifx_Read;
         d->fxState[3].offset = OFFB_FPROUND;
         d->fxState[3].size   = sizeof(ULong);

         d->fxState[4].fx     = Ifx_Read;
         d->fxState[4].offset = OFFB_FC3210;
         d->fxState[4].size   = sizeof(ULong);

         d->fxState[5].fx     = Ifx_Read;
         d->fxState[5].offset = OFFB_SSEROUND;
         d->fxState[5].size   = sizeof(ULong);

         /* Call the helper.  This creates all parts of the in-memory
            image except for the XMM[0..15] array, which we do
            separately, in order that any undefinedness in the XMM
            registers is tracked separately by Memcheck and does not
            "infect" the in-memory shadow for the other parts of the
            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
            SSEROUND). */
         stmt( IRStmt_Dirty(d) );

         /* And now the XMMs themselves. */
         /* Each xmm reg occupies 16 bytes of the image, starting at
            offset 160. */
         UInt xmm;
         for (xmm = 0; xmm < 16; xmm++) {
            storeLE( binop(Iop_Add64, mkexpr(addr), mkU64(160 + xmm * 16)),
                     getXMMReg(xmm) );
         }

         goto decode_success;
      }
      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
         As with FXSAVE above we ignore the value of REX.W since we're
         not bothering with the FPU DP and IP fields. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
         IRDirty* d;
         modrm = getUChar(delta);
         vassert(!epartIsReg(modrm));

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);

         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);

         /* Uses dirty helper:
              VexEmNote amd64g_do_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
                                                           ULong )
            NOTE:
              the VexEmNote value is simply ignored
         */
         d = unsafeIRDirty_0_N (
                0/*regparms*/,
                "amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM",
                &amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM,
                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
             );

         /* declare we're reading memory */
         d->mFx   = Ifx_Read;
         d->mAddr = mkexpr(addr);
         d->mSize = 464; /* according to recent Intel docs */

         /* declare we're writing guest state */
         d->nFxState = 6;
         vex_bzero(&d->fxState, sizeof(d->fxState));

         d->fxState[0].fx     = Ifx_Write;
         d->fxState[0].offset = OFFB_FTOP;
         d->fxState[0].size   = sizeof(UInt);

         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_FPREGS;
         d->fxState[1].size   = 8 * sizeof(ULong);

         d->fxState[2].fx     = Ifx_Write;
         d->fxState[2].offset = OFFB_FPTAGS;
         d->fxState[2].size   = 8 * sizeof(UChar);

         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_FPROUND;
         d->fxState[3].size   = sizeof(ULong);

         d->fxState[4].fx     = Ifx_Write;
         d->fxState[4].offset = OFFB_FC3210;
         d->fxState[4].size   = sizeof(ULong);

         d->fxState[5].fx     = Ifx_Write;
         d->fxState[5].offset = OFFB_SSEROUND;
         d->fxState[5].size   = sizeof(ULong);

         /* Call the helper.  This reads all parts of the in-memory
            image except for the XMM[0..15] array, which we do
            separately, in order that any undefinedness in the XMM
            registers is tracked separately by Memcheck and does not
            "infect" the in-guest-state shadow for the other parts of the
            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
            SSEROUND). */
         stmt( IRStmt_Dirty(d) );

         /* And now the XMMs themselves. */
         /* Each xmm reg occupies 16 bytes of the image, starting at
            offset 160 -- mirrors the FXSAVE store loop above. */
         UInt xmm;
         for (xmm = 0; xmm < 16; xmm++) {
            putXMMReg(xmm, loadLE(Ity_V128,
                                  binop(Iop_Add64, mkexpr(addr),
                                                   mkU64(160 + xmm * 16))));
         }

         goto decode_success;
      }
      break;
   13729 
   13730    case 0xC2:
   13731       /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   13732       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13733          Long delta0 = delta;
   13734          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
   13735          if (delta > delta0) goto decode_success;
   13736       }
   13737       /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   13738       if (haveF3no66noF2(pfx) && sz == 4) {
   13739          Long delta0 = delta;
   13740          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
   13741          if (delta > delta0) goto decode_success;
   13742       }
   13743       /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   13744       if (haveF2no66noF3(pfx) && sz == 4) {
   13745          Long delta0 = delta;
   13746          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
   13747          if (delta > delta0) goto decode_success;
   13748       }
   13749       /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   13750       if (have66noF2noF3(pfx) && sz == 2) {
   13751          Long delta0 = delta;
   13752          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
   13753          if (delta > delta0) goto decode_success;
   13754       }
   13755       break;
   13756 
   13757    case 0xC3:
   13758       /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   13759       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   13760          modrm = getUChar(delta);
   13761          if (!epartIsReg(modrm)) {
   13762             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13763             storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
   13764             DIP("movnti %s,%s\n", dis_buf,
   13765                                   nameIRegG(sz, pfx, modrm));
   13766             delta += alen;
   13767             goto decode_success;
   13768          }
   13769          /* else fall through */
   13770       }
   13771       break;
   13772 
   case 0xC4:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of mmx(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
            mmx reg.  t4 is the new lane value.  t5 is the original
            mmx value. t6 is the new mmx value. */
         Int lane;
         t4 = newTemp(Ity_I16);
         t5 = newTemp(Ity_I64);
         t6 = newTemp(Ity_I64);
         modrm = getUChar(delta);
         do_MMX_preamble();

         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
         breakup64to16s( t5, &t3, &t2, &t1, &t0 );

         if (epartIsReg(modrm)) {
            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
            /* skip modrm byte + imm8; the imm8 (lane number) then sits
               at delta-1 */
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                      nameIReg16(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            /* trailing 1: tell disAMode the imm8 follows the amode */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Rebuild the 64-bit value with t4 replacing the lane
            selected by imm8 (mod 4). */
         switch (lane & 3) {
            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
            default: vassert(0);
         }
         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
         goto decode_success;
      }
      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of xmm(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Int lane;
         t4 = newTemp(Ity_I16);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign(t4, getIReg16(rE));
            /* skip modrm byte + imm8; imm8 then sits at delta-1 */
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n",
                (Int)lane, nameIReg16(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*byte after the amode*/ );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n",
                (Int)lane, dis_buf, nameXMMReg(rG));
         }
         /* Insert t4 into lane (imm8 mod 8) of the existing xmm
            value. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg(rG));
         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
         putXMMReg(rG, mkexpr(res_vec));
         goto decode_success;
      }
      break;
   13850 
   case 0xC5:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
         zero-extend of it in ireg(G). */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            IRTemp sV = newTemp(Ity_I64);
            t5 = newTemp(Ity_I16);
            do_MMX_preamble();
            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
            /* The lane-select imm8 is the byte following the modrm
               byte; only its low 2 bits matter for 4 lanes. */
            switch (getUChar(delta+1) & 3) {
               case 0:  assign(t5, mkexpr(t0)); break;
               case 1:  assign(t5, mkexpr(t1)); break;
               case 2:  assign(t5, mkexpr(t2)); break;
               case 3:  assign(t5, mkexpr(t3)); break;
               default: vassert(0);
            }
            /* Zero-extend the lane into a 64- or 32-bit ireg,
               depending on operand size. */
            if (sz == 8)
               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
            else
               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
            DIP("pextrw $%d,%s,%s\n",
                (Int)getUChar(delta+1),
                nameMMXReg(eregLO3ofRM(modrm)),
                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                      : nameIReg32(gregOfRexRM(pfx,modrm))
            );
            /* consume modrm byte + imm8 */
            delta += 2;
            goto decode_success;
         }
         /* else fall through */
         /* note, for anyone filling in the mem case: this insn has one
            byte after the amode and therefore you must pass 1 as the
            last arg to disAMode */
      }
      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
         zero-extend of it in ireg(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Long delta0 = delta;
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              False/*!isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   13899 
   case 0xC6:
      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int    imm8 = 0;
         IRTemp sV   = newTemp(Ity_V128);
         IRTemp dV   = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            /* imm8 is the byte after the modrm byte */
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
         } else {
            /* trailing 1: imm8 follows the amode, hence the read at
               delta+alen */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    select;
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            /* imm8 is the byte after the modrm byte */
            select = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufpd $%d,%s,%s\n", select,
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            /* trailing 1: imm8 follows the amode */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            select = getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufpd $%d,%s,%s\n", select,
                                      dis_buf,
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_SHUFPD_128( sV, dV, select );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      break;
   13957 
   13958    case 0xD1:
   13959       /* 66 0F D1 = PSRLW by E */
   13960       if (have66noF2noF3(pfx) && sz == 2) {
   13961          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
   13962          goto decode_success;
   13963       }
   13964       break;
   13965 
   13966    case 0xD2:
   13967       /* 66 0F D2 = PSRLD by E */
   13968       if (have66noF2noF3(pfx) && sz == 2) {
   13969          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
   13970          goto decode_success;
   13971       }
   13972       break;
   13973 
   13974    case 0xD3:
   13975       /* 66 0F D3 = PSRLQ by E */
   13976       if (have66noF2noF3(pfx) && sz == 2) {
   13977          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
   13978          goto decode_success;
   13979       }
   13980       break;
   13981 
   13982    case 0xD4:
   13983       /* 66 0F D4 = PADDQ */
   13984       if (have66noF2noF3(pfx) && sz == 2) {
   13985          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13986                                     "paddq", Iop_Add64x2, False );
   13987          goto decode_success;
   13988       }
   13989       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   13990       /* 0F D4 = PADDQ -- add 64x1 */
   13991       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13992          do_MMX_preamble();
   13993          delta = dis_MMXop_regmem_to_reg (
   13994                    vbi, pfx, delta, opc, "paddq", False );
   13995          goto decode_success;
   13996       }
   13997       break;
   13998 
   13999    case 0xD5:
   14000       /* 66 0F D5 = PMULLW -- 16x8 multiply */
   14001       if (have66noF2noF3(pfx) && sz == 2) {
   14002          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14003                                     "pmullw", Iop_Mul16x8, False );
   14004          goto decode_success;
   14005       }
   14006       break;
   14007 
   case 0xD6:
      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
         hi half). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            /* Iop_64UtoV128 places the 64-bit value in the low lane
               and zeroes the high lane. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
         or lo half xmm).  */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            /* memory form: store the low 64 bits of xmm(G) */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
      if (haveF2no66noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            putMMXReg( gregLO3ofRM(modrm),
                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      break;
   14056 
   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a byte, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
         mmx(E), turn them into a byte, and put zero-extend of it in
         ireg(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            t0 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I32);
            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
            /* Iop_GetMSBs8x8 gathers the top bit of each byte lane
               into a single byte, which is then zero-widened to 32
               bits. */
            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameIReg32(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14090 
   14091    case 0xD8:
   14092       /* 66 0F D8 = PSUBUSB */
   14093       if (have66noF2noF3(pfx) && sz == 2) {
   14094          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14095                                     "psubusb", Iop_QSub8Ux16, False );
   14096          goto decode_success;
   14097       }
   14098       break;
   14099 
   14100    case 0xD9:
   14101       /* 66 0F D9 = PSUBUSW */
   14102       if (have66noF2noF3(pfx) && sz == 2) {
   14103          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14104                                     "psubusw", Iop_QSub16Ux8, False );
   14105          goto decode_success;
   14106       }
   14107       break;
   14108 
   14109    case 0xDA:
   14110       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14111       /* 0F DA = PMINUB -- 8x8 unsigned min */
   14112       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14113          do_MMX_preamble();
   14114          delta = dis_MMXop_regmem_to_reg (
   14115                     vbi, pfx, delta, opc, "pminub", False );
   14116          goto decode_success;
   14117       }
   14118       /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   14119       if (have66noF2noF3(pfx) && sz == 2) {
   14120          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14121                                     "pminub", Iop_Min8Ux16, False );
   14122          goto decode_success;
   14123       }
   14124       break;
   14125 
   14126    case 0xDB:
   14127       /* 66 0F DB = PAND */
   14128       if (have66noF2noF3(pfx) && sz == 2) {
   14129          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
   14130          goto decode_success;
   14131       }
   14132       break;
   14133 
   14134    case 0xDC:
   14135       /* 66 0F DC = PADDUSB */
   14136       if (have66noF2noF3(pfx) && sz == 2) {
   14137          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14138                                     "paddusb", Iop_QAdd8Ux16, False );
   14139          goto decode_success;
   14140       }
   14141       break;
   14142 
   14143    case 0xDD:
   14144       /* 66 0F DD = PADDUSW */
   14145       if (have66noF2noF3(pfx) && sz == 2) {
   14146          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14147                                     "paddusw", Iop_QAdd16Ux8, False );
   14148          goto decode_success;
   14149       }
   14150       break;
   14151 
   14152    case 0xDE:
   14153       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14154       /* 0F DE = PMAXUB -- 8x8 unsigned max */
   14155       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14156          do_MMX_preamble();
   14157          delta = dis_MMXop_regmem_to_reg (
   14158                     vbi, pfx, delta, opc, "pmaxub", False );
   14159          goto decode_success;
   14160       }
   14161       /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   14162       if (have66noF2noF3(pfx) && sz == 2) {
   14163          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14164                                     "pmaxub", Iop_Max8Ux16, False );
   14165          goto decode_success;
   14166       }
   14167       break;
   14168 
   14169    case 0xDF:
   14170       /* 66 0F DF = PANDN */
   14171       if (have66noF2noF3(pfx) && sz == 2) {
   14172          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   14173          goto decode_success;
   14174       }
   14175       break;
   14176 
   14177    case 0xE0:
   14178       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14179       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   14180       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14181          do_MMX_preamble();
   14182          delta = dis_MMXop_regmem_to_reg (
   14183                     vbi, pfx, delta, opc, "pavgb", False );
   14184          goto decode_success;
   14185       }
   14186       /* 66 0F E0 = PAVGB */
   14187       if (have66noF2noF3(pfx) && sz == 2) {
   14188          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14189                                     "pavgb", Iop_Avg8Ux16, False );
   14190          goto decode_success;
   14191       }
   14192       break;
   14193 
   14194    case 0xE1:
   14195       /* 66 0F E1 = PSRAW by E */
   14196       if (have66noF2noF3(pfx) && sz == 2) {
   14197          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   14198          goto decode_success;
   14199       }
   14200       break;
   14201 
   14202    case 0xE2:
   14203       /* 66 0F E2 = PSRAD by E */
   14204       if (have66noF2noF3(pfx) && sz == 2) {
   14205          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   14206          goto decode_success;
   14207       }
   14208       break;
   14209 
   14210    case 0xE3:
   14211       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14212       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   14213       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14214          do_MMX_preamble();
   14215          delta = dis_MMXop_regmem_to_reg (
   14216                     vbi, pfx, delta, opc, "pavgw", False );
   14217          goto decode_success;
   14218       }
   14219       /* 66 0F E3 = PAVGW */
   14220       if (have66noF2noF3(pfx) && sz == 2) {
   14221          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14222                                     "pavgw", Iop_Avg16Ux8, False );
   14223          goto decode_success;
   14224       }
   14225       break;
   14226 
   14227    case 0xE4:
   14228       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14229       /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   14230       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14231          do_MMX_preamble();
   14232          delta = dis_MMXop_regmem_to_reg (
   14233                     vbi, pfx, delta, opc, "pmuluh", False );
   14234          goto decode_success;
   14235       }
   14236       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   14237       if (have66noF2noF3(pfx) && sz == 2) {
   14238          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14239                                     "pmulhuw", Iop_MulHi16Ux8, False );
   14240          goto decode_success;
   14241       }
   14242       break;
   14243 
   14244    case 0xE5:
   14245       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   14246       if (have66noF2noF3(pfx) && sz == 2) {
   14247          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14248                                     "pmulhw", Iop_MulHi16Sx8, False );
   14249          goto decode_success;
   14250       }
   14251       break;
   14252 
   14253    case 0xE6:
   14254       /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14255          lo half xmm(G), and zero upper half, rounding towards zero */
   14256       /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14257          lo half xmm(G), according to prevailing rounding mode, and zero
   14258          upper half */
   14259       if ( (haveF2no66noF3(pfx) && sz == 4)
   14260            || (have66noF2noF3(pfx) && sz == 2) ) {
   14261          delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
   14262                                     toBool(sz == 2)/*r2zero*/);
   14263          goto decode_success;
   14264       }
   14265       /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   14266          F64 in xmm(G) */
   14267       if (haveF3no66noF2(pfx) && sz == 4) {
   14268          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
   14269          goto decode_success;
   14270       }
   14271       break;
   14272 
   14273    case 0xE7:
   14274       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14275       /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   14276          Intel manual does not say anything about the usual business of
   14277          the FP reg tags getting trashed whenever an MMX insn happens.
   14278          So we just leave them alone.
   14279       */
   14280       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14281          modrm = getUChar(delta);
   14282          if (!epartIsReg(modrm)) {
   14283             /* do_MMX_preamble(); Intel docs don't specify this */
   14284             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14285             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   14286             DIP("movntq %s,%s\n", dis_buf,
   14287                                   nameMMXReg(gregLO3ofRM(modrm)));
   14288             delta += alen;
   14289             goto decode_success;
   14290          }
   14291          /* else fall through */
   14292       }
   14293       /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   14294       if (have66noF2noF3(pfx) && sz == 2) {
   14295          modrm = getUChar(delta);
   14296          if (!epartIsReg(modrm)) {
   14297             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14298             gen_SEGV_if_not_16_aligned( addr );
   14299             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14300             DIP("movntdq %s,%s\n", dis_buf,
   14301                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14302             delta += alen;
   14303             goto decode_success;
   14304          }
   14305          /* else fall through */
   14306       }
   14307       break;
   14308 
   14309    case 0xE8:
   14310       /* 66 0F E8 = PSUBSB */
   14311       if (have66noF2noF3(pfx) && sz == 2) {
   14312          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14313                                     "psubsb", Iop_QSub8Sx16, False );
   14314          goto decode_success;
   14315       }
   14316       break;
   14317 
   14318    case 0xE9:
   14319       /* 66 0F E9 = PSUBSW */
   14320       if (have66noF2noF3(pfx) && sz == 2) {
   14321          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14322                                     "psubsw", Iop_QSub16Sx8, False );
   14323          goto decode_success;
   14324       }
   14325       break;
   14326 
   14327    case 0xEA:
   14328       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14329       /* 0F EA = PMINSW -- 16x4 signed min */
   14330       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14331          do_MMX_preamble();
   14332          delta = dis_MMXop_regmem_to_reg (
   14333                     vbi, pfx, delta, opc, "pminsw", False );
   14334          goto decode_success;
   14335       }
   14336       /* 66 0F EA = PMINSW -- 16x8 signed min */
   14337       if (have66noF2noF3(pfx) && sz == 2) {
   14338          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14339                                     "pminsw", Iop_Min16Sx8, False );
   14340          goto decode_success;
   14341       }
   14342       break;
   14343 
   14344    case 0xEB:
   14345       /* 66 0F EB = POR */
   14346       if (have66noF2noF3(pfx) && sz == 2) {
   14347          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   14348          goto decode_success;
   14349       }
   14350       break;
   14351 
   14352    case 0xEC:
   14353       /* 66 0F EC = PADDSB */
   14354       if (have66noF2noF3(pfx) && sz == 2) {
   14355          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14356                                     "paddsb", Iop_QAdd8Sx16, False );
   14357          goto decode_success;
   14358       }
   14359       break;
   14360 
   14361    case 0xED:
   14362       /* 66 0F ED = PADDSW */
   14363       if (have66noF2noF3(pfx) && sz == 2) {
   14364          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14365                                     "paddsw", Iop_QAdd16Sx8, False );
   14366          goto decode_success;
   14367       }
   14368       break;
   14369 
   14370    case 0xEE:
   14371       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14372       /* 0F EE = PMAXSW -- 16x4 signed max */
   14373       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14374          do_MMX_preamble();
   14375          delta = dis_MMXop_regmem_to_reg (
   14376                     vbi, pfx, delta, opc, "pmaxsw", False );
   14377          goto decode_success;
   14378       }
   14379       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   14380       if (have66noF2noF3(pfx) && sz == 2) {
   14381          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14382                                     "pmaxsw", Iop_Max16Sx8, False );
   14383          goto decode_success;
   14384       }
   14385       break;
   14386 
   14387    case 0xEF:
   14388       /* 66 0F EF = PXOR */
   14389       if (have66noF2noF3(pfx) && sz == 2) {
   14390          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   14391          goto decode_success;
   14392       }
   14393       break;
   14394 
   14395    case 0xF1:
   14396       /* 66 0F F1 = PSLLW by E */
   14397       if (have66noF2noF3(pfx) && sz == 2) {
   14398          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   14399          goto decode_success;
   14400       }
   14401       break;
   14402 
   14403    case 0xF2:
   14404       /* 66 0F F2 = PSLLD by E */
   14405       if (have66noF2noF3(pfx) && sz == 2) {
   14406          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   14407          goto decode_success;
   14408       }
   14409       break;
   14410 
   14411    case 0xF3:
   14412       /* 66 0F F3 = PSLLQ by E */
   14413       if (have66noF2noF3(pfx) && sz == 2) {
   14414          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   14415          goto decode_success;
   14416       }
   14417       break;
   14418 
   14419    case 0xF4:
   14420       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   14421          0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   14422          half */
   14423       if (have66noF2noF3(pfx) && sz == 2) {
   14424          IRTemp sV = newTemp(Ity_V128);
   14425          IRTemp dV = newTemp(Ity_V128);
   14426          modrm = getUChar(delta);
   14427          UInt rG = gregOfRexRM(pfx,modrm);
   14428          assign( dV, getXMMReg(rG) );
   14429          if (epartIsReg(modrm)) {
   14430             UInt rE = eregOfRexRM(pfx,modrm);
   14431             assign( sV, getXMMReg(rE) );
   14432             delta += 1;
   14433             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14434          } else {
   14435             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14436             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14437             delta += alen;
   14438             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   14439          }
   14440          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   14441          goto decode_success;
   14442       }
   14443       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14444       /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   14445          0 to form 64-bit result */
   14446       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14447          IRTemp sV = newTemp(Ity_I64);
   14448          IRTemp dV = newTemp(Ity_I64);
   14449          t1 = newTemp(Ity_I32);
   14450          t0 = newTemp(Ity_I32);
   14451          modrm = getUChar(delta);
   14452 
   14453          do_MMX_preamble();
   14454          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14455 
   14456          if (epartIsReg(modrm)) {
   14457             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14458             delta += 1;
   14459             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14460                                    nameMMXReg(gregLO3ofRM(modrm)));
   14461          } else {
   14462             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14463             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14464             delta += alen;
   14465             DIP("pmuludq %s,%s\n", dis_buf,
   14466                                    nameMMXReg(gregLO3ofRM(modrm)));
   14467          }
   14468 
   14469          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   14470          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   14471          putMMXReg( gregLO3ofRM(modrm),
   14472                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   14473          goto decode_success;
   14474       }
   14475       break;
   14476 
   14477    case 0xF5:
   14478       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   14479          E(xmm or mem) to G(xmm) */
   14480       if (have66noF2noF3(pfx) && sz == 2) {
   14481          IRTemp sV = newTemp(Ity_V128);
   14482          IRTemp dV = newTemp(Ity_V128);
   14483          modrm     = getUChar(delta);
   14484          UInt   rG = gregOfRexRM(pfx,modrm);
   14485          if (epartIsReg(modrm)) {
   14486             UInt rE = eregOfRexRM(pfx,modrm);
   14487             assign( sV, getXMMReg(rE) );
   14488             delta += 1;
   14489             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14490          } else {
   14491             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14492             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14493             delta += alen;
   14494             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   14495          }
   14496          assign( dV, getXMMReg(rG) );
   14497          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   14498          goto decode_success;
   14499       }
   14500       break;
   14501 
   14502    case 0xF6:
   14503       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14504       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   14505       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14506          do_MMX_preamble();
   14507          delta = dis_MMXop_regmem_to_reg (
   14508                     vbi, pfx, delta, opc, "psadbw", False );
   14509          goto decode_success;
   14510       }
   14511       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   14512          from E(xmm or mem) to G(xmm) */
   14513       if (have66noF2noF3(pfx) && sz == 2) {
   14514          IRTemp sV  = newTemp(Ity_V128);
   14515          IRTemp dV  = newTemp(Ity_V128);
   14516          modrm = getUChar(delta);
   14517          UInt   rG   = gregOfRexRM(pfx,modrm);
   14518          if (epartIsReg(modrm)) {
   14519             UInt rE = eregOfRexRM(pfx,modrm);
   14520             assign( sV, getXMMReg(rE) );
   14521             delta += 1;
   14522             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14523          } else {
   14524             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14525             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14526             delta += alen;
   14527             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   14528          }
   14529          assign( dV, getXMMReg(rG) );
   14530          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   14531 
   14532          goto decode_success;
   14533       }
   14534       break;
   14535 
   14536    case 0xF7:
   14537       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14538       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   14539       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14540          Bool ok = False;
   14541          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   14542          if (ok) goto decode_success;
   14543       }
   14544       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   14545       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   14546          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   14547          goto decode_success;
   14548       }
   14549       break;
   14550 
   14551    case 0xF8:
   14552       /* 66 0F F8 = PSUBB */
   14553       if (have66noF2noF3(pfx) && sz == 2) {
   14554          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14555                                     "psubb", Iop_Sub8x16, False );
   14556          goto decode_success;
   14557       }
   14558       break;
   14559 
   14560    case 0xF9:
   14561       /* 66 0F F9 = PSUBW */
   14562       if (have66noF2noF3(pfx) && sz == 2) {
   14563          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14564                                     "psubw", Iop_Sub16x8, False );
   14565          goto decode_success;
   14566       }
   14567       break;
   14568 
   14569    case 0xFA:
   14570       /* 66 0F FA = PSUBD */
   14571       if (have66noF2noF3(pfx) && sz == 2) {
   14572          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14573                                     "psubd", Iop_Sub32x4, False );
   14574          goto decode_success;
   14575       }
   14576       break;
   14577 
   14578    case 0xFB:
   14579       /* 66 0F FB = PSUBQ */
   14580       if (have66noF2noF3(pfx) && sz == 2) {
   14581          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14582                                     "psubq", Iop_Sub64x2, False );
   14583          goto decode_success;
   14584       }
   14585       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14586       /* 0F FB = PSUBQ -- sub 64x1 */
   14587       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14588          do_MMX_preamble();
   14589          delta = dis_MMXop_regmem_to_reg (
   14590                    vbi, pfx, delta, opc, "psubq", False );
   14591          goto decode_success;
   14592       }
   14593       break;
   14594 
   14595    case 0xFC:
   14596       /* 66 0F FC = PADDB */
   14597       if (have66noF2noF3(pfx) && sz == 2) {
   14598          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14599                                     "paddb", Iop_Add8x16, False );
   14600          goto decode_success;
   14601       }
   14602       break;
   14603 
   14604    case 0xFD:
   14605       /* 66 0F FD = PADDW */
   14606       if (have66noF2noF3(pfx) && sz == 2) {
   14607          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14608                                     "paddw", Iop_Add16x8, False );
   14609          goto decode_success;
   14610       }
   14611       break;
   14612 
   14613    case 0xFE:
   14614       /* 66 0F FE = PADDD */
   14615       if (have66noF2noF3(pfx) && sz == 2) {
   14616          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14617                                     "paddd", Iop_Add32x4, False );
   14618          goto decode_success;
   14619       }
   14620       break;
   14621 
   14622    default:
   14623       goto decode_failure;
   14624 
   14625    }
   14626 
   14627   decode_failure:
   14628    *decode_OK = False;
   14629    return deltaIN;
   14630 
   14631   decode_success:
   14632    *decode_OK = True;
   14633    return delta;
   14634 }
   14635 
   14636 
   14637 /*------------------------------------------------------------*/
   14638 /*---                                                      ---*/
   14639 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   14640 /*---                                                      ---*/
   14641 /*------------------------------------------------------------*/
   14642 
/* (V)MOVDDUP, 128-bit form: read the low 64 bits of E (xmm register
   or memory) and duplicate them into both 64-bit lanes of xmm(G).
   For the AVX-encoded form the upper half of the destination YMM
   register is zeroed as well.  Returns the updated decode offset. */
static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   IRTemp d0    = newTemp(Ity_I64);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      /* Register source: d0 is the low 64 bits of E. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      DIP("%smovddup %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
      delta += 1;
      assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   } else {
      /* Memory source: only 64 bits are fetched (no alignment check). */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
      DIP("%smovddup %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
      delta += alen;
   }
   /* Write d0:d0 to G; AVX form also zeroes the upper YMM half. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   return delta;
}
   14671 
   14672 
   14673 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   14674                               Long delta )
   14675 {
   14676    IRTemp addr   = IRTemp_INVALID;
   14677    Int    alen   = 0;
   14678    HChar  dis_buf[50];
   14679    IRTemp d0    = newTemp(Ity_I64);
   14680    IRTemp d1    = newTemp(Ity_I64);
   14681    UChar  modrm = getUChar(delta);
   14682    UInt   rG    = gregOfRexRM(pfx,modrm);
   14683    if (epartIsReg(modrm)) {
   14684       UInt rE = eregOfRexRM(pfx,modrm);
   14685       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   14686       delta += 1;
   14687       assign ( d0, getYMMRegLane64(rE, 0) );
   14688       assign ( d1, getYMMRegLane64(rE, 2) );
   14689    } else {
   14690       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14691       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   14692       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   14693                                         mkexpr(addr), mkU64(16))) );
   14694       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   14695       delta += alen;
   14696    }
   14697    putYMMRegLane64( rG, 0, mkexpr(d0) );
   14698    putYMMRegLane64( rG, 1, mkexpr(d0) );
   14699    putYMMRegLane64( rG, 2, mkexpr(d1) );
   14700    putYMMRegLane64( rG, 3, mkexpr(d1) );
   14701    return delta;
   14702 }
   14703 
   14704 
/* (V)MOVSLDUP / (V)MOVSHDUP, 128-bit form: copy E (xmm register or
   16-byte-aligned memory for the non-AVX form) to xmm(G), duplicating
   32-bit lanes.  isL selects MOVSLDUP (even lanes: 2:2:0:0), else
   MOVSHDUP (odd lanes: 3:3:1:1).  The AVX form also zeroes the upper
   YMM half.  Returns the updated decode offset. */
static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool isL )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      DIP("%smovs%cdup %s,%s\n",
          isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Legacy SSE3 encoding requires a 16-aligned memory operand;
         the AVX encoding does not. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      DIP("%smovs%cdup %s,%s\n",
          isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
      delta += alen;
   }
   /* Split source into 32-bit lanes and rebuild with the selected
      lanes duplicated. */
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
                : mkV128from32s( s3, s3, s1, s1 ) );
   return delta;
}
   14737 
   14738 
/* VMOVSLDUP / VMOVSHDUP, 256-bit form: copy E (ymm register or
   memory; no alignment check for AVX) to ymm(G), duplicating 32-bit
   lanes within each 128-bit half.  isL selects VMOVSLDUP (even
   lanes), else VMOVSHDUP (odd lanes).  Returns the updated decode
   offset. */
static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isL )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      DIP("vmovs%cdup %s,%s\n",
          isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      DIP("vmovs%cdup %s,%s\n",
          isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
      delta += alen;
   }
   /* Split into eight 32-bit lanes; the duplication is done
      independently on the upper and lower 128-bit halves. */
   breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
                                : mkV128from32s( s7, s7, s5, s5 ) );
   putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
                                : mkV128from32s( s3, s3, s1, s1 ) );
   return delta;
}
   14770 
   14771 
   14772 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14773 {
   14774    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   14775    IRTemp leftV  = newTemp(Ity_V128);
   14776    IRTemp rightV = newTemp(Ity_V128);
   14777    IRTemp rm     = newTemp(Ity_I32);
   14778    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   14779 
   14780    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   14781    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   14782 
   14783    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   14784    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   14785 
   14786    IRTemp res = newTemp(Ity_V128);
   14787    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   14788    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   14789                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   14790    return res;
   14791 }
   14792 
   14793 
   14794 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14795 {
   14796    IRTemp s1, s0, d1, d0;
   14797    IRTemp leftV  = newTemp(Ity_V128);
   14798    IRTemp rightV = newTemp(Ity_V128);
   14799    IRTemp rm     = newTemp(Ity_I32);
   14800    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   14801 
   14802    breakupV128to64s( sV, &s1, &s0 );
   14803    breakupV128to64s( dV, &d1, &d0 );
   14804 
   14805    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   14806    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   14807 
   14808    IRTemp res = newTemp(Ity_V128);
   14809    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   14810    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   14811                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   14812    return res;
   14813 }
   14814 
   14815 
   14816 __attribute__((noinline))
   14817 static
   14818 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   14819                         const VexAbiInfo* vbi,
   14820                         Prefix pfx, Int sz, Long deltaIN )
   14821 {
   14822    IRTemp addr  = IRTemp_INVALID;
   14823    UChar  modrm = 0;
   14824    Int    alen  = 0;
   14825    HChar  dis_buf[50];
   14826 
   14827    *decode_OK = False;
   14828 
   14829    Long   delta = deltaIN;
   14830    UChar  opc   = getUChar(delta);
   14831    delta++;
   14832    switch (opc) {
   14833 
   14834    case 0x12:
   14835       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   14836          duplicating some lanes (2:2:0:0). */
   14837       if (haveF3no66noF2(pfx) && sz == 4) {
   14838          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14839                                    True/*isL*/ );
   14840          goto decode_success;
   14841       }
   14842       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   14843          duplicating some lanes (0:1:0:1). */
   14844       if (haveF2no66noF3(pfx)
   14845           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14846          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   14847          goto decode_success;
   14848       }
   14849       break;
   14850 
   14851    case 0x16:
   14852       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   14853          duplicating some lanes (3:3:1:1). */
   14854       if (haveF3no66noF2(pfx) && sz == 4) {
   14855          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14856                                    False/*!isL*/ );
   14857          goto decode_success;
   14858       }
   14859       break;
   14860 
   14861    case 0x7C:
   14862    case 0x7D:
   14863       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   14864       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   14865       if (haveF2no66noF3(pfx) && sz == 4) {
   14866          IRTemp eV     = newTemp(Ity_V128);
   14867          IRTemp gV     = newTemp(Ity_V128);
   14868          Bool   isAdd  = opc == 0x7C;
   14869          const HChar* str = isAdd ? "add" : "sub";
   14870          modrm         = getUChar(delta);
   14871          UInt   rG     = gregOfRexRM(pfx,modrm);
   14872          if (epartIsReg(modrm)) {
   14873             UInt rE = eregOfRexRM(pfx,modrm);
   14874             assign( eV, getXMMReg(rE) );
   14875             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14876             delta += 1;
   14877          } else {
   14878             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14879             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14880             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14881             delta += alen;
   14882          }
   14883 
   14884          assign( gV, getXMMReg(rG) );
   14885          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   14886          goto decode_success;
   14887       }
   14888       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   14889       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   14890       if (have66noF2noF3(pfx) && sz == 2) {
   14891          IRTemp eV     = newTemp(Ity_V128);
   14892          IRTemp gV     = newTemp(Ity_V128);
   14893          Bool   isAdd  = opc == 0x7C;
   14894          const HChar* str = isAdd ? "add" : "sub";
   14895          modrm         = getUChar(delta);
   14896          UInt   rG     = gregOfRexRM(pfx,modrm);
   14897          if (epartIsReg(modrm)) {
   14898             UInt rE = eregOfRexRM(pfx,modrm);
   14899             assign( eV, getXMMReg(rE) );
   14900             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14901             delta += 1;
   14902          } else {
   14903             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14904             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14905             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14906             delta += alen;
   14907          }
   14908 
   14909          assign( gV, getXMMReg(rG) );
   14910          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   14911          goto decode_success;
   14912       }
   14913       break;
   14914 
   14915    case 0xD0:
   14916       /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
   14917       if (have66noF2noF3(pfx) && sz == 2) {
   14918          IRTemp eV   = newTemp(Ity_V128);
   14919          IRTemp gV   = newTemp(Ity_V128);
   14920          modrm       = getUChar(delta);
   14921          UInt   rG   = gregOfRexRM(pfx,modrm);
   14922          if (epartIsReg(modrm)) {
   14923             UInt rE = eregOfRexRM(pfx,modrm);
   14924             assign( eV, getXMMReg(rE) );
   14925             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14926             delta += 1;
   14927          } else {
   14928             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14929             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14930             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   14931             delta += alen;
   14932          }
   14933 
   14934          assign( gV, getXMMReg(rG) );
   14935          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   14936          goto decode_success;
   14937       }
   14938       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   14939       if (haveF2no66noF3(pfx) && sz == 4) {
   14940          IRTemp eV   = newTemp(Ity_V128);
   14941          IRTemp gV   = newTemp(Ity_V128);
   14942          modrm       = getUChar(delta);
   14943          UInt   rG   = gregOfRexRM(pfx,modrm);
   14944 
   14945          modrm = getUChar(delta);
   14946          if (epartIsReg(modrm)) {
   14947             UInt rE = eregOfRexRM(pfx,modrm);
   14948             assign( eV, getXMMReg(rE) );
   14949             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14950             delta += 1;
   14951          } else {
   14952             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14953             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14954             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   14955             delta += alen;
   14956          }
   14957 
   14958          assign( gV, getXMMReg(rG) );
   14959          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   14960          goto decode_success;
   14961       }
   14962       break;
   14963 
   14964    case 0xF0:
   14965       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   14966       if (haveF2no66noF3(pfx) && sz == 4) {
   14967          modrm = getUChar(delta);
   14968          if (epartIsReg(modrm)) {
   14969             goto decode_failure;
   14970          } else {
   14971             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14972             putXMMReg( gregOfRexRM(pfx,modrm),
   14973                        loadLE(Ity_V128, mkexpr(addr)) );
   14974             DIP("lddqu %s,%s\n", dis_buf,
   14975                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14976             delta += alen;
   14977          }
   14978          goto decode_success;
   14979       }
   14980       break;
   14981 
   14982    default:
   14983       goto decode_failure;
   14984 
   14985    }
   14986 
   14987   decode_failure:
   14988    *decode_OK = False;
   14989    return deltaIN;
   14990 
   14991   decode_success:
   14992    *decode_OK = True;
   14993    return delta;
   14994 }
   14995 
   14996 
   14997 /*------------------------------------------------------------*/
   14998 /*---                                                      ---*/
   14999 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
   15000 /*---                                                      ---*/
   15001 /*------------------------------------------------------------*/
   15002 
/* Generate IR for a 128-bit PSHUFB: permute the 16 bytes of dV under
   control of the byte selectors in sV.  For each byte lane: if bit 7
   of the selector is set, the result byte is zero; otherwise the
   result byte is dV[selector & 15].  Only 8x8 (64-bit) permute
   primitives are available, so each 64-bit half of the result is
   built by permuting both halves of dV with the selector's low 3
   bits and then merging per byte on selector bit 3. */
static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
   IRTemp sHi        = newTemp(Ity_I64);
   IRTemp sLo        = newTemp(Ity_I64);
   IRTemp dHi        = newTemp(Ity_I64);
   IRTemp dLo        = newTemp(Ity_I64);
   IRTemp rHi        = newTemp(Ity_I64);
   IRTemp rLo        = newTemp(Ity_I64);
   IRTemp sevens     = newTemp(Ity_I64);
   IRTemp mask0x80hi = newTemp(Ity_I64);
   IRTemp mask0x80lo = newTemp(Ity_I64);
   IRTemp maskBit3hi = newTemp(Ity_I64);
   IRTemp maskBit3lo = newTemp(Ity_I64);
   IRTemp sAnd7hi    = newTemp(Ity_I64);
   IRTemp sAnd7lo    = newTemp(Ity_I64);
   IRTemp permdHi    = newTemp(Ity_I64);
   IRTemp permdLo    = newTemp(Ity_I64);
   IRTemp res        = newTemp(Ity_V128);

   /* Split both operands into 64-bit halves. */
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   assign( sevens, mkU64(0x0707070707070707ULL) );

   /* mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
   */
   /* Per byte: 0x00 where selector bit 7 is set, 0xFF otherwise;
      used at the end to zero lanes with the top bit set. */
   assign(
      mask0x80hi,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

   /* Per byte: 0xFF where selector bit 3 is set (index 8..15, i.e.
      source byte comes from dHi), 0x00 otherwise (from dLo). */
   assign(
      maskBit3hi,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
            mkU8(7)));

   /* Selector bits 0..2 index a byte within one 64-bit half. */
   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

   /* Permute both halves of dV, then pick per byte on bit 3. */
   assign(
      permdHi,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
               mkexpr(maskBit3hi)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
               unop(Iop_Not64,mkexpr(maskBit3hi))) ));

   /* Finally, zero out the lanes whose selector had bit 7 set. */
   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

   /* And the same for the lower half of the result.  What fun. */

   assign(
      mask0x80lo,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

   assign(
      maskBit3lo,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
            mkU8(7)));

   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

   assign(
      permdLo,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
               mkexpr(maskBit3lo)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
               unop(Iop_Not64,mkexpr(maskBit3lo))) ));

   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   15092 
   15093 
   15094 static
   15095 IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
   15096 {
   15097    IRTemp sHi, sLo, dHi, dLo;
   15098    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15099    breakupV256toV128s( dV, &dHi, &dLo);
   15100    breakupV256toV128s( sV, &sHi, &sLo);
   15101    IRTemp res = newTemp(Ity_V256);
   15102    assign(res, binop(Iop_V128HLtoV256,
   15103                      mkexpr(math_PSHUFB_XMM(dHi, sHi)),
   15104                      mkexpr(math_PSHUFB_XMM(dLo, sLo))));
   15105    return res;
   15106 }
   15107 
   15108 
/* Decode the 128-bit horizontal add/sub family (PHADDW/PHADDD/
   PHADDSW/PHSUBW/PHSUBD/PHSUBSW, and the VEX.128 V-prefixed forms
   when isAvx).  |delta| points at the modrm byte on entry; the
   updated |delta| (past the instruction's operand bytes) is
   returned. */
static Long dis_PHADD_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                            Bool isAvx, UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   const HChar* str = "???";
   IROp   opV64  = Iop_INVALID;
   IROp   opCatO = Iop_CatOddLanes16x4;
   IROp   opCatE = Iop_CatEvenLanes16x4;
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp dV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dHi    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   /* AVX forms take the first source from the VEX.vvvv register;
      the SSSE3 forms use the destination register itself. */
   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;

   /* Pick the 64-bit-wide lane op implied by the opcode byte. */
   switch (opc) {
      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
      default: vassert(0);
   }
   /* The D (32x2) variants need 32-bit odd/even gathering instead of
      the default 16-bit one. */
   if (opc == 0x02 || opc == 0x06) {
      opCatO = Iop_InterleaveHI32x2;
      opCatE = Iop_InterleaveLO32x2;
   }

   assign( dV, getXMMReg(rV) );

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
          nameXMMReg(rE), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the legacy SSSE3 encoding requires the memory operand
         to be 16-aligned. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
          dis_buf, nameXMMReg(rG));
      delta += alen;
   }

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */

   /* Result layout: high 64 bits from the E operand's lane pairs,
      low 64 bits from the V operand's lane pairs.  AVX forms zero
      the upper half of the YMM destination. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG,
        binop(Iop_64HLtoV128,
              binop(opV64,
                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
              binop(opV64,
                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
   return delta;
}
   15181 
   15182 
/* Decode the VEX.256 horizontal add/sub family (VPHADDW/VPHADDD/
   VPHADDSW/VPHSUBW/VPHSUBD/VPHSUBSW), computed independently on each
   128-bit lane.  |delta| points at the modrm byte on entry; the
   updated |delta| is returned. */
static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                            UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   const HChar* str = "???";
   IROp   opV64  = Iop_INVALID;
   IROp   opCatO = Iop_CatOddLanes16x4;
   IROp   opCatE = Iop_CatEvenLanes16x4;
   IRTemp sV     = newTemp(Ity_V256);
   IRTemp dV     = newTemp(Ity_V256);
   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   UInt   rV     = getVexNvvvv(pfx);

   /* Pick the 64-bit-wide lane op implied by the opcode byte. */
   switch (opc) {
      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
      default: vassert(0);
   }
   /* The D (32x2) variants need 32-bit odd/even gathering instead of
      the default 16-bit one. */
   if (opc == 0x02 || opc == 0x06) {
      opCatO = Iop_InterleaveHI32x2;
      opCatE = Iop_InterleaveLO32x2;
   }

   assign( dV, getYMMReg(rV) );

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
      delta += 1;
   } else {
      /* Memory form; note no alignment check for the AVX encoding. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
      delta += alen;
   }

   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */

   /* Within each 128-bit lane: high 64 bits from the E operand's
      pairs, low 64 bits from the vvvv operand's pairs. */
   putYMMReg( rG,
              binop(Iop_V128HLtoV256,
                    binop(Iop_64HLtoV128,
                          binop(opV64,
                                binop(opCatE,mkexpr(s3),mkexpr(s2)),
                                binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
                          binop(opV64,
                                binop(opCatE,mkexpr(d3),mkexpr(d2)),
                                binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
                    binop(Iop_64HLtoV128,
                          binop(opV64,
                                binop(opCatE,mkexpr(s1),mkexpr(s0)),
                                binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
                          binop(opV64,
                                binop(opCatE,mkexpr(d1),mkexpr(d0)),
                                binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
   return delta;
}
   15254 
   15255 
   15256 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
   15257 {
   15258    IRTemp sVoddsSX  = newTemp(Ity_V128);
   15259    IRTemp sVevensSX = newTemp(Ity_V128);
   15260    IRTemp dVoddsZX  = newTemp(Ity_V128);
   15261    IRTemp dVevensZX = newTemp(Ity_V128);
   15262    /* compute dV unsigned x sV signed */
   15263    assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   15264    assign( sVevensSX, binop(Iop_SarN16x8,
   15265                             binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   15266                             mkU8(8)) );
   15267    assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   15268    assign( dVevensZX, binop(Iop_ShrN16x8,
   15269                             binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   15270                             mkU8(8)) );
   15271 
   15272    IRTemp res = newTemp(Ity_V128);
   15273    assign( res, binop(Iop_QAdd16Sx8,
   15274                       binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15275                       binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15276                      )
   15277          );
   15278    return res;
   15279 }
   15280 
   15281 
   15282 static
   15283 IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
   15284 {
   15285    IRTemp sHi, sLo, dHi, dLo;
   15286    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15287    breakupV256toV128s( dV, &dHi, &dLo);
   15288    breakupV256toV128s( sV, &sHi, &sLo);
   15289    IRTemp res = newTemp(Ity_V256);
   15290    assign(res, binop(Iop_V128HLtoV256,
   15291                      mkexpr(math_PMADDUBSW_128(dHi, sHi)),
   15292                      mkexpr(math_PMADDUBSW_128(dLo, sLo))));
   15293    return res;
   15294 }
   15295 
   15296 
   15297 __attribute__((noinline))
   15298 static
   15299 Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
   15300                              const VexAbiInfo* vbi,
   15301                              Prefix pfx, Int sz, Long deltaIN )
   15302 {
   15303    IRTemp addr  = IRTemp_INVALID;
   15304    UChar  modrm = 0;
   15305    Int    alen  = 0;
   15306    HChar  dis_buf[50];
   15307 
   15308    *decode_OK = False;
   15309 
   15310    Long   delta = deltaIN;
   15311    UChar  opc   = getUChar(delta);
   15312    delta++;
   15313    switch (opc) {
   15314 
   15315    case 0x00:
   15316       /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   15317       if (have66noF2noF3(pfx)
   15318           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15319          IRTemp sV = newTemp(Ity_V128);
   15320          IRTemp dV = newTemp(Ity_V128);
   15321 
   15322          modrm = getUChar(delta);
   15323          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15324 
   15325          if (epartIsReg(modrm)) {
   15326             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15327             delta += 1;
   15328             DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   15329                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   15330          } else {
   15331             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15332             gen_SEGV_if_not_16_aligned( addr );
   15333             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15334             delta += alen;
   15335             DIP("pshufb %s,%s\n", dis_buf,
   15336                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   15337          }
   15338 
   15339          IRTemp res = math_PSHUFB_XMM( dV, sV );
   15340          putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
   15341          goto decode_success;
   15342       }
   15343       /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   15344       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15345          IRTemp sV      = newTemp(Ity_I64);
   15346          IRTemp dV      = newTemp(Ity_I64);
   15347 
   15348          modrm = getUChar(delta);
   15349          do_MMX_preamble();
   15350          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15351 
   15352          if (epartIsReg(modrm)) {
   15353             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15354             delta += 1;
   15355             DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15356                                   nameMMXReg(gregLO3ofRM(modrm)));
   15357          } else {
   15358             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15359             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15360             delta += alen;
   15361             DIP("pshufb %s,%s\n", dis_buf,
   15362                                   nameMMXReg(gregLO3ofRM(modrm)));
   15363          }
   15364 
   15365          putMMXReg(
   15366             gregLO3ofRM(modrm),
   15367             binop(
   15368                Iop_And64,
   15369                /* permute the lanes */
   15370                binop(
   15371                   Iop_Perm8x8,
   15372                   mkexpr(dV),
   15373                   binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   15374                ),
   15375                /* mask off lanes which have (index & 0x80) == 0x80 */
   15376                unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   15377             )
   15378          );
   15379          goto decode_success;
   15380       }
   15381       break;
   15382 
   15383    case 0x01:
   15384    case 0x02:
   15385    case 0x03:
   15386    case 0x05:
   15387    case 0x06:
   15388    case 0x07:
   15389       /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   15390          G to G (xmm). */
   15391       /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   15392          G to G (xmm). */
   15393       /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   15394          xmm) and G to G (xmm). */
   15395       /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   15396          G to G (xmm). */
   15397       /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   15398          G to G (xmm). */
   15399       /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   15400          xmm) and G to G (xmm). */
   15401       if (have66noF2noF3(pfx)
   15402           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15403          delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
   15404          goto decode_success;
   15405       }
   15406       /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   15407       /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   15408          to G (mmx). */
   15409       /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   15410          to G (mmx). */
   15411       /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   15412          mmx) and G to G (mmx). */
   15413       /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   15414          to G (mmx). */
   15415       /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   15416          to G (mmx). */
   15417       /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   15418          mmx) and G to G (mmx). */
   15419       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15420          const HChar* str = "???";
   15421          IROp   opV64  = Iop_INVALID;
   15422          IROp   opCatO = Iop_CatOddLanes16x4;
   15423          IROp   opCatE = Iop_CatEvenLanes16x4;
   15424          IRTemp sV     = newTemp(Ity_I64);
   15425          IRTemp dV     = newTemp(Ity_I64);
   15426 
   15427          modrm = getUChar(delta);
   15428 
   15429          switch (opc) {
   15430             case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15431             case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15432             case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15433             case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15434             case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15435             case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15436             default: vassert(0);
   15437          }
   15438          if (opc == 0x02 || opc == 0x06) {
   15439             opCatO = Iop_InterleaveHI32x2;
   15440             opCatE = Iop_InterleaveLO32x2;
   15441          }
   15442 
   15443          do_MMX_preamble();
   15444          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15445 
   15446          if (epartIsReg(modrm)) {
   15447             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15448             delta += 1;
   15449             DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15450                                      nameMMXReg(gregLO3ofRM(modrm)));
   15451          } else {
   15452             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15453             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15454             delta += alen;
   15455             DIP("ph%s %s,%s\n", str, dis_buf,
   15456                                      nameMMXReg(gregLO3ofRM(modrm)));
   15457          }
   15458 
   15459          putMMXReg(
   15460             gregLO3ofRM(modrm),
   15461             binop(opV64,
   15462                   binop(opCatE,mkexpr(sV),mkexpr(dV)),
   15463                   binop(opCatO,mkexpr(sV),mkexpr(dV))
   15464             )
   15465          );
   15466          goto decode_success;
   15467       }
   15468       break;
   15469 
   15470    case 0x04:
   15471       /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   15472          Unsigned Bytes (XMM) */
   15473       if (have66noF2noF3(pfx)
   15474           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15475          IRTemp sV = newTemp(Ity_V128);
   15476          IRTemp dV = newTemp(Ity_V128);
   15477          modrm     = getUChar(delta);
   15478          UInt   rG = gregOfRexRM(pfx,modrm);
   15479 
   15480          assign( dV, getXMMReg(rG) );
   15481 
   15482          if (epartIsReg(modrm)) {
   15483             UInt rE = eregOfRexRM(pfx,modrm);
   15484             assign( sV, getXMMReg(rE) );
   15485             delta += 1;
   15486             DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15487          } else {
   15488             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15489             gen_SEGV_if_not_16_aligned( addr );
   15490             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15491             delta += alen;
   15492             DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
   15493          }
   15494 
   15495          putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
   15496          goto decode_success;
   15497       }
   15498       /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   15499          Unsigned Bytes (MMX) */
   15500       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15501          IRTemp sV        = newTemp(Ity_I64);
   15502          IRTemp dV        = newTemp(Ity_I64);
   15503          IRTemp sVoddsSX  = newTemp(Ity_I64);
   15504          IRTemp sVevensSX = newTemp(Ity_I64);
   15505          IRTemp dVoddsZX  = newTemp(Ity_I64);
   15506          IRTemp dVevensZX = newTemp(Ity_I64);
   15507 
   15508          modrm = getUChar(delta);
   15509          do_MMX_preamble();
   15510          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15511 
   15512          if (epartIsReg(modrm)) {
   15513             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15514             delta += 1;
   15515             DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15516                                      nameMMXReg(gregLO3ofRM(modrm)));
   15517          } else {
   15518             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15519             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15520             delta += alen;
   15521             DIP("pmaddubsw %s,%s\n", dis_buf,
   15522                                      nameMMXReg(gregLO3ofRM(modrm)));
   15523          }
   15524 
   15525          /* compute dV unsigned x sV signed */
   15526          assign( sVoddsSX,
   15527                  binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   15528          assign( sVevensSX,
   15529                  binop(Iop_SarN16x4,
   15530                        binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   15531                        mkU8(8)) );
   15532          assign( dVoddsZX,
   15533                  binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   15534          assign( dVevensZX,
   15535                  binop(Iop_ShrN16x4,
   15536                        binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   15537                        mkU8(8)) );
   15538 
   15539          putMMXReg(
   15540             gregLO3ofRM(modrm),
   15541             binop(Iop_QAdd16Sx4,
   15542                   binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15543                   binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15544             )
   15545          );
   15546          goto decode_success;
   15547       }
   15548       break;
   15549 
   15550    case 0x08:
   15551    case 0x09:
   15552    case 0x0A:
   15553       /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   15554       /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   15555       /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   15556       if (have66noF2noF3(pfx)
   15557           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15558          IRTemp sV      = newTemp(Ity_V128);
   15559          IRTemp dV      = newTemp(Ity_V128);
   15560          IRTemp sHi     = newTemp(Ity_I64);
   15561          IRTemp sLo     = newTemp(Ity_I64);
   15562          IRTemp dHi     = newTemp(Ity_I64);
   15563          IRTemp dLo     = newTemp(Ity_I64);
   15564          const HChar* str = "???";
   15565          Int    laneszB = 0;
   15566 
   15567          switch (opc) {
   15568             case 0x08: laneszB = 1; str = "b"; break;
   15569             case 0x09: laneszB = 2; str = "w"; break;
   15570             case 0x0A: laneszB = 4; str = "d"; break;
   15571             default: vassert(0);
   15572          }
   15573 
   15574          modrm = getUChar(delta);
   15575          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15576 
   15577          if (epartIsReg(modrm)) {
   15578             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15579             delta += 1;
   15580             DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   15581                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   15582          } else {
   15583             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15584             gen_SEGV_if_not_16_aligned( addr );
   15585             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15586             delta += alen;
   15587             DIP("psign%s %s,%s\n", str, dis_buf,
   15588                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   15589          }
   15590 
   15591          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15592          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15593          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15594          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15595 
   15596          putXMMReg(
   15597             gregOfRexRM(pfx,modrm),
   15598             binop(Iop_64HLtoV128,
   15599                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   15600                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   15601             )
   15602          );
   15603          goto decode_success;
   15604       }
   15605       /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   15606       /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   15607       /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   15608       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15609          IRTemp sV      = newTemp(Ity_I64);
   15610          IRTemp dV      = newTemp(Ity_I64);
   15611          const HChar* str = "???";
   15612          Int    laneszB = 0;
   15613 
   15614          switch (opc) {
   15615             case 0x08: laneszB = 1; str = "b"; break;
   15616             case 0x09: laneszB = 2; str = "w"; break;
   15617             case 0x0A: laneszB = 4; str = "d"; break;
   15618             default: vassert(0);
   15619          }
   15620 
   15621          modrm = getUChar(delta);
   15622          do_MMX_preamble();
   15623          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15624 
   15625          if (epartIsReg(modrm)) {
   15626             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15627             delta += 1;
   15628             DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15629                                         nameMMXReg(gregLO3ofRM(modrm)));
   15630          } else {
   15631             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15632             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15633             delta += alen;
   15634             DIP("psign%s %s,%s\n", str, dis_buf,
   15635                                         nameMMXReg(gregLO3ofRM(modrm)));
   15636          }
   15637 
   15638          putMMXReg(
   15639             gregLO3ofRM(modrm),
   15640             dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   15641          );
   15642          goto decode_success;
   15643       }
   15644       break;
   15645 
   15646    case 0x0B:
   15647       /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   15648          Scale (XMM) */
   15649       if (have66noF2noF3(pfx)
   15650           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15651          IRTemp sV  = newTemp(Ity_V128);
   15652          IRTemp dV  = newTemp(Ity_V128);
   15653          IRTemp sHi = newTemp(Ity_I64);
   15654          IRTemp sLo = newTemp(Ity_I64);
   15655          IRTemp dHi = newTemp(Ity_I64);
   15656          IRTemp dLo = newTemp(Ity_I64);
   15657 
   15658          modrm = getUChar(delta);
   15659          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15660 
   15661          if (epartIsReg(modrm)) {
   15662             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15663             delta += 1;
   15664             DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   15665                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   15666          } else {
   15667             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15668             gen_SEGV_if_not_16_aligned( addr );
   15669             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15670             delta += alen;
   15671             DIP("pmulhrsw %s,%s\n", dis_buf,
   15672                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   15673          }
   15674 
   15675          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15676          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15677          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15678          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15679 
   15680          putXMMReg(
   15681             gregOfRexRM(pfx,modrm),
   15682             binop(Iop_64HLtoV128,
   15683                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   15684                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   15685             )
   15686          );
   15687          goto decode_success;
   15688       }
   15689       /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   15690          (MMX) */
   15691       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15692          IRTemp sV = newTemp(Ity_I64);
   15693          IRTemp dV = newTemp(Ity_I64);
   15694 
   15695          modrm = getUChar(delta);
   15696          do_MMX_preamble();
   15697          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15698 
   15699          if (epartIsReg(modrm)) {
   15700             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15701             delta += 1;
   15702             DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15703                                     nameMMXReg(gregLO3ofRM(modrm)));
   15704          } else {
   15705             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15706             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15707             delta += alen;
   15708             DIP("pmulhrsw %s,%s\n", dis_buf,
   15709                                     nameMMXReg(gregLO3ofRM(modrm)));
   15710          }
   15711 
   15712          putMMXReg(
   15713             gregLO3ofRM(modrm),
   15714             dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   15715          );
   15716          goto decode_success;
   15717       }
   15718       break;
   15719 
   15720    case 0x1C:
   15721    case 0x1D:
   15722    case 0x1E:
   15723       /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   15724       /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   15725       /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   15726       if (have66noF2noF3(pfx)
   15727           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15728          IRTemp sV  = newTemp(Ity_V128);
   15729          const HChar* str = "???";
   15730          Int    laneszB = 0;
   15731 
   15732          switch (opc) {
   15733             case 0x1C: laneszB = 1; str = "b"; break;
   15734             case 0x1D: laneszB = 2; str = "w"; break;
   15735             case 0x1E: laneszB = 4; str = "d"; break;
   15736             default: vassert(0);
   15737          }
   15738 
   15739          modrm = getUChar(delta);
   15740          if (epartIsReg(modrm)) {
   15741             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15742             delta += 1;
   15743             DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   15744                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15745          } else {
   15746             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15747             gen_SEGV_if_not_16_aligned( addr );
   15748             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15749             delta += alen;
   15750             DIP("pabs%s %s,%s\n", str, dis_buf,
   15751                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15752          }
   15753 
   15754          putXMMReg( gregOfRexRM(pfx,modrm),
   15755                     mkexpr(math_PABS_XMM(sV, laneszB)) );
   15756          goto decode_success;
   15757       }
   15758       /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   15759       /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   15760       /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   15761       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15762          IRTemp sV      = newTemp(Ity_I64);
   15763          const HChar* str = "???";
   15764          Int    laneszB = 0;
   15765 
   15766          switch (opc) {
   15767             case 0x1C: laneszB = 1; str = "b"; break;
   15768             case 0x1D: laneszB = 2; str = "w"; break;
   15769             case 0x1E: laneszB = 4; str = "d"; break;
   15770             default: vassert(0);
   15771          }
   15772 
   15773          modrm = getUChar(delta);
   15774          do_MMX_preamble();
   15775 
   15776          if (epartIsReg(modrm)) {
   15777             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15778             delta += 1;
   15779             DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15780                                        nameMMXReg(gregLO3ofRM(modrm)));
   15781          } else {
   15782             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15783             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15784             delta += alen;
   15785             DIP("pabs%s %s,%s\n", str, dis_buf,
   15786                                        nameMMXReg(gregLO3ofRM(modrm)));
   15787          }
   15788 
   15789          putMMXReg( gregLO3ofRM(modrm),
   15790                     mkexpr(math_PABS_MMX( sV, laneszB )) );
   15791          goto decode_success;
   15792       }
   15793       break;
   15794 
   15795    default:
   15796       break;
   15797 
   15798    }
   15799 
   15800   //decode_failure:
   15801    *decode_OK = False;
   15802    return deltaIN;
   15803 
   15804   decode_success:
   15805    *decode_OK = True;
   15806    return delta;
   15807 }
   15808 
   15809 
   15810 /*------------------------------------------------------------*/
   15811 /*---                                                      ---*/
   15812 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
   15813 /*---                                                      ---*/
   15814 /*------------------------------------------------------------*/
   15815 
   15816 __attribute__((noinline))
   15817 static
   15818 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
   15819                              const VexAbiInfo* vbi,
   15820                              Prefix pfx, Int sz, Long deltaIN )
   15821 {
   15822    Long   d64   = 0;
   15823    IRTemp addr  = IRTemp_INVALID;
   15824    UChar  modrm = 0;
   15825    Int    alen  = 0;
   15826    HChar  dis_buf[50];
   15827 
   15828    *decode_OK = False;
   15829 
   15830    Long   delta = deltaIN;
   15831    UChar  opc   = getUChar(delta);
   15832    delta++;
   15833    switch (opc) {
   15834 
   15835    case 0x0F:
   15836       /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   15837       if (have66noF2noF3(pfx)
   15838           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15839          IRTemp sV  = newTemp(Ity_V128);
   15840          IRTemp dV  = newTemp(Ity_V128);
   15841 
   15842          modrm = getUChar(delta);
   15843          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15844 
   15845          if (epartIsReg(modrm)) {
   15846             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15847             d64 = (Long)getUChar(delta+1);
   15848             delta += 1+1;
   15849             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15850                                        nameXMMReg(eregOfRexRM(pfx,modrm)),
   15851                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15852          } else {
   15853             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15854             gen_SEGV_if_not_16_aligned( addr );
   15855             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15856             d64 = (Long)getUChar(delta+alen);
   15857             delta += alen+1;
   15858             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15859                                        dis_buf,
   15860                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15861          }
   15862 
   15863          IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
   15864          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   15865          goto decode_success;
   15866       }
   15867       /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   15868       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15869          IRTemp sV  = newTemp(Ity_I64);
   15870          IRTemp dV  = newTemp(Ity_I64);
   15871          IRTemp res = newTemp(Ity_I64);
   15872 
   15873          modrm = getUChar(delta);
   15874          do_MMX_preamble();
   15875          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15876 
   15877          if (epartIsReg(modrm)) {
   15878             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15879             d64 = (Long)getUChar(delta+1);
   15880             delta += 1+1;
   15881             DIP("palignr $%d,%s,%s\n",  (Int)d64,
   15882                                         nameMMXReg(eregLO3ofRM(modrm)),
   15883                                         nameMMXReg(gregLO3ofRM(modrm)));
   15884          } else {
   15885             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15886             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15887             d64 = (Long)getUChar(delta+alen);
   15888             delta += alen+1;
   15889             DIP("palignr $%d%s,%s\n", (Int)d64,
   15890                                       dis_buf,
   15891                                       nameMMXReg(gregLO3ofRM(modrm)));
   15892          }
   15893 
   15894          if (d64 == 0) {
   15895             assign( res, mkexpr(sV) );
   15896          }
   15897          else if (d64 >= 1 && d64 <= 7) {
   15898             assign(res,
   15899                    binop(Iop_Or64,
   15900                          binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   15901                          binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   15902                         )));
   15903          }
   15904          else if (d64 == 8) {
   15905            assign( res, mkexpr(dV) );
   15906          }
   15907          else if (d64 >= 9 && d64 <= 15) {
   15908             assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   15909          }
   15910          else if (d64 >= 16 && d64 <= 255) {
   15911             assign( res, mkU64(0) );
   15912          }
   15913          else
   15914             vassert(0);
   15915 
   15916          putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   15917          goto decode_success;
   15918       }
   15919       break;
   15920 
   15921    default:
   15922       break;
   15923 
   15924    }
   15925 
   15926   //decode_failure:
   15927    *decode_OK = False;
   15928    return deltaIN;
   15929 
   15930   decode_success:
   15931    *decode_OK = True;
   15932    return delta;
   15933 }
   15934 
   15935 
   15936 /*------------------------------------------------------------*/
   15937 /*---                                                      ---*/
   15938 /*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
   15939 /*---                                                      ---*/
   15940 /*------------------------------------------------------------*/
   15941 
/* Disassemble SSE4/ABM/BMI instructions in the plain 0F escape space:
   POPCNT (F3 0F B8), TZCNT (F3 0F BC, only on BMI1-capable CPUs) and
   LZCNT (F3 0F BD, only on LZCNT-capable CPUs).  On success, sets
   *decode_OK to True and returns the updated instruction offset;
   otherwise leaves *decode_OK False and returns deltaIN unchanged so
   the caller can try other decoders (e.g. BSF/BSR for 0F BC/BD on
   CPUs lacking the extensions). */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
                        const VexArchInfo* archinfo,
                        const VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in a register
      */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp result = gen_POPCOUNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(result));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A C P are cleared.  Z is set if SRC == 0.
         // (Computed by widening SRC to 64 bits, comparing against
         // zero, and shifting the 0/1 result into the Z position of
         // a COPY thunk.)
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64,
                                widenUto64(mkexpr(src)),
                                mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z))));

         goto decode_success;
      }
      break;

   case 0xBC:
      /* F3 0F BC -- TZCNT (count trailing zeroes.  A BMI extension,
         which we can only decode if we're sure this is a BMI1 capable cpu
         that supports TZCNT, since otherwise it's BSF, which behaves
         differently on zero source.  */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_TZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         // Build Z and C as 0/1 bits and shift each into position.
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
         which we can only decode if we're sure this is an AMD cpu
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_LZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         // Build Z and C as 0/1 bits and shift each into position.
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16139 
   16140 
   16141 /*------------------------------------------------------------*/
   16142 /*---                                                      ---*/
   16143 /*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
   16144 /*---                                                      ---*/
   16145 /*------------------------------------------------------------*/
   16146 
   16147 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
   16148                                   IRTemp vec0/*controlling mask*/,
   16149                                   UInt gran, IROp opSAR )
   16150 {
   16151    /* The tricky bit is to convert vec0 into a suitable mask, by
   16152       copying the most significant bit of each lane into all positions
   16153       in the lane. */
   16154    IRTemp sh = newTemp(Ity_I8);
   16155    assign(sh, mkU8(8 * gran - 1));
   16156 
   16157    IRTemp mask = newTemp(Ity_V128);
   16158    assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
   16159 
   16160    IRTemp notmask = newTemp(Ity_V128);
   16161    assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
   16162 
   16163    IRTemp res = newTemp(Ity_V128);
   16164    assign(res,  binop(Iop_OrV128,
   16165                       binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
   16166                       binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
   16167    return res;
   16168 }
   16169 
   16170 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
   16171                                   IRTemp vec0/*controlling mask*/,
   16172                                   UInt gran, IROp opSAR128 )
   16173 {
   16174    /* The tricky bit is to convert vec0 into a suitable mask, by
   16175       copying the most significant bit of each lane into all positions
   16176       in the lane. */
   16177    IRTemp sh = newTemp(Ity_I8);
   16178    assign(sh, mkU8(8 * gran - 1));
   16179 
   16180    IRTemp vec0Hi = IRTemp_INVALID;
   16181    IRTemp vec0Lo = IRTemp_INVALID;
   16182    breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
   16183 
   16184    IRTemp mask = newTemp(Ity_V256);
   16185    assign(mask, binop(Iop_V128HLtoV256,
   16186                       binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
   16187                       binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
   16188 
   16189    IRTemp notmask = newTemp(Ity_V256);
   16190    assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
   16191 
   16192    IRTemp res = newTemp(Ity_V256);
   16193    assign(res,  binop(Iop_OrV256,
   16194                       binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
   16195                       binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
   16196    return res;
   16197 }
   16198 
   16199 static Long dis_VBLENDV_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   16200                               const HChar *name, UInt gran, IROp opSAR )
   16201 {
   16202    IRTemp addr   = IRTemp_INVALID;
   16203    Int    alen   = 0;
   16204    HChar  dis_buf[50];
   16205    UChar  modrm  = getUChar(delta);
   16206    UInt   rG     = gregOfRexRM(pfx, modrm);
   16207    UInt   rV     = getVexNvvvv(pfx);
   16208    UInt   rIS4   = 0xFF; /* invalid */
   16209    IRTemp vecE   = newTemp(Ity_V128);
   16210    IRTemp vecV   = newTemp(Ity_V128);
   16211    IRTemp vecIS4 = newTemp(Ity_V128);
   16212    if (epartIsReg(modrm)) {
   16213       delta++;
   16214       UInt rE = eregOfRexRM(pfx, modrm);
   16215       assign(vecE, getXMMReg(rE));
   16216       UChar ib = getUChar(delta);
   16217       rIS4 = (ib >> 4) & 0xF;
   16218       DIP("%s %s,%s,%s,%s\n",
   16219           name, nameXMMReg(rIS4), nameXMMReg(rE),
   16220           nameXMMReg(rV), nameXMMReg(rG));
   16221    } else {
   16222       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16223       delta += alen;
   16224       assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
   16225       UChar ib = getUChar(delta);
   16226       rIS4 = (ib >> 4) & 0xF;
   16227       DIP("%s %s,%s,%s,%s\n",
   16228           name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   16229    }
   16230    delta++;
   16231    assign(vecV,   getXMMReg(rV));
   16232    assign(vecIS4, getXMMReg(rIS4));
   16233    IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
   16234    putYMMRegLoAndZU( rG, mkexpr(res) );
   16235    return delta;
   16236 }
   16237 
   16238 static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   16239                               const HChar *name, UInt gran, IROp opSAR128 )
   16240 {
   16241    IRTemp addr   = IRTemp_INVALID;
   16242    Int    alen   = 0;
   16243    HChar  dis_buf[50];
   16244    UChar  modrm  = getUChar(delta);
   16245    UInt   rG     = gregOfRexRM(pfx, modrm);
   16246    UInt   rV     = getVexNvvvv(pfx);
   16247    UInt   rIS4   = 0xFF; /* invalid */
   16248    IRTemp vecE   = newTemp(Ity_V256);
   16249    IRTemp vecV   = newTemp(Ity_V256);
   16250    IRTemp vecIS4 = newTemp(Ity_V256);
   16251    if (epartIsReg(modrm)) {
   16252       delta++;
   16253       UInt rE = eregOfRexRM(pfx, modrm);
   16254       assign(vecE, getYMMReg(rE));
   16255       UChar ib = getUChar(delta);
   16256       rIS4 = (ib >> 4) & 0xF;
   16257       DIP("%s %s,%s,%s,%s\n",
   16258           name, nameYMMReg(rIS4), nameYMMReg(rE),
   16259           nameYMMReg(rV), nameYMMReg(rG));
   16260    } else {
   16261       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16262       delta += alen;
   16263       assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
   16264       UChar ib = getUChar(delta);
   16265       rIS4 = (ib >> 4) & 0xF;
   16266       DIP("%s %s,%s,%s,%s\n",
   16267           name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   16268    }
   16269    delta++;
   16270    assign(vecV,   getYMMReg(rV));
   16271    assign(vecIS4, getYMMReg(rIS4));
   16272    IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
   16273    putYMMReg( rG, mkexpr(res) );
   16274    return delta;
   16275 }
   16276 
/* Common flag-thunk generation for PTEST / VTESTPS / VTESTPD.
   andV holds (vecE & vecG) and andnV holds (vecE & ~vecG), both
   V128.  sign selects the variant: 0 = PTEST (test all bits),
   32 = VTESTPS (test only bits 31 and 63 of each 64-bit half),
   64 = VTESTPD (test only bit 63 of each half).  Emits IR that sets
   Z and C via a COPY thunk and clears O, S, A and P. */
static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
{
   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */

   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
      and bottom 64-bits together.  It relies on this trick:

      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
      InterleaveHI64x2([a,b],[a,b]) == [a,a]

      and so the OR of the above 2 exprs produces
      [a OR b, a OR b], from which we simply take the lower half.
   */
   IRTemp and64  = newTemp(Ity_I64);
   IRTemp andn64 = newTemp(Ity_I64);

   assign(and64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andV), mkexpr(andV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andV), mkexpr(andV)))));

   assign(andn64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andnV), mkexpr(andnV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andnV), mkexpr(andnV)))));

   /* z64/c64 end up as all-ones when the relevant bits of and64
      resp. andn64 are all zero, and all-zeroes otherwise; the Z and C
      flag bits are then sliced out at the bottom of the function. */
   IRTemp z64 = newTemp(Ity_I64);
   IRTemp c64 = newTemp(Ity_I64);
   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and negate.  */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
   } else {
      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fallthrough into the PTEST handling.  */
         IRTemp t0 = newTemp(Ity_I64);
         IRTemp t1 = newTemp(Ity_I64);
         IRTemp t2 = newTemp(Ity_I64);
         assign(t0, mkU64(0x8000000080000000ULL));
         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
         /* NB: rebind the local IRTemp handles to the masked values;
            the code below then operates on the masked versions. */
         and64 = t1;
         andn64 = t2;
      }
      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                                    mkexpr(and64)), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                                    mkexpr(andn64)), mkU8(63))));
   }

   /* And finally, slice out the Z and C flags and set the flags
      thunk to COPY for them.  OSAP are set to zero. */
   IRTemp newOSZACP = newTemp(Ity_I64);
   assign(newOSZACP,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));

   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
}
   16373 
   16374 
   16375 /* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
   16376    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
   16377 static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16378                              Long delta, Bool isAvx, Int sign )
   16379 {
   16380    IRTemp addr   = IRTemp_INVALID;
   16381    Int    alen   = 0;
   16382    HChar  dis_buf[50];
   16383    UChar  modrm  = getUChar(delta);
   16384    UInt   rG     = gregOfRexRM(pfx, modrm);
   16385    IRTemp vecE = newTemp(Ity_V128);
   16386    IRTemp vecG = newTemp(Ity_V128);
   16387 
   16388    if ( epartIsReg(modrm) ) {
   16389       UInt rE = eregOfRexRM(pfx, modrm);
   16390       assign(vecE, getXMMReg(rE));
   16391       delta += 1;
   16392       DIP( "%s%stest%s %s,%s\n",
   16393            isAvx ? "v" : "", sign == 0 ? "p" : "",
   16394            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16395            nameXMMReg(rE), nameXMMReg(rG) );
   16396    } else {
   16397       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16398       if (!isAvx)
   16399          gen_SEGV_if_not_16_aligned( addr );
   16400       assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   16401       delta += alen;
   16402       DIP( "%s%stest%s %s,%s\n",
   16403            isAvx ? "v" : "", sign == 0 ? "p" : "",
   16404            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16405            dis_buf, nameXMMReg(rG) );
   16406    }
   16407 
   16408    assign(vecG, getXMMReg(rG));
   16409 
   16410    /* Set Z=1 iff (vecE & vecG) == 0
   16411       Set C=1 iff (vecE & not vecG) == 0
   16412    */
   16413 
   16414    /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   16415    IRTemp andV  = newTemp(Ity_V128);
   16416    IRTemp andnV = newTemp(Ity_V128);
   16417    assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
   16418    assign(andnV, binop(Iop_AndV128,
   16419                        mkexpr(vecE),
   16420                        binop(Iop_XorV128, mkexpr(vecG),
   16421                                           mkV128(0xFFFF))));
   16422 
   16423    finish_xTESTy ( andV, andnV, sign );
   16424    return delta;
   16425 }
   16426 
   16427 
   16428 /* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
   16429    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
   16430 static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx,
   16431                              Long delta, Int sign )
   16432 {
   16433    IRTemp addr   = IRTemp_INVALID;
   16434    Int    alen   = 0;
   16435    HChar  dis_buf[50];
   16436    UChar  modrm  = getUChar(delta);
   16437    UInt   rG     = gregOfRexRM(pfx, modrm);
   16438    IRTemp vecE   = newTemp(Ity_V256);
   16439    IRTemp vecG   = newTemp(Ity_V256);
   16440 
   16441    if ( epartIsReg(modrm) ) {
   16442       UInt rE = eregOfRexRM(pfx, modrm);
   16443       assign(vecE, getYMMReg(rE));
   16444       delta += 1;
   16445       DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
   16446            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16447            nameYMMReg(rE), nameYMMReg(rG) );
   16448    } else {
   16449       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16450       assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
   16451       delta += alen;
   16452       DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
   16453            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16454            dis_buf, nameYMMReg(rG) );
   16455    }
   16456 
   16457    assign(vecG, getYMMReg(rG));
   16458 
   16459    /* Set Z=1 iff (vecE & vecG) == 0
   16460       Set C=1 iff (vecE & not vecG) == 0
   16461    */
   16462 
   16463    /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   16464    IRTemp andV  = newTemp(Ity_V256);
   16465    IRTemp andnV = newTemp(Ity_V256);
   16466    assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
   16467    assign(andnV, binop(Iop_AndV256,
   16468                        mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
   16469 
   16470    IRTemp andVhi  = IRTemp_INVALID;
   16471    IRTemp andVlo  = IRTemp_INVALID;
   16472    IRTemp andnVhi = IRTemp_INVALID;
   16473    IRTemp andnVlo = IRTemp_INVALID;
   16474    breakupV256toV128s( andV, &andVhi, &andVlo );
   16475    breakupV256toV128s( andnV, &andnVhi, &andnVlo );
   16476 
   16477    IRTemp andV128  = newTemp(Ity_V128);
   16478    IRTemp andnV128 = newTemp(Ity_V128);
   16479    assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
   16480    assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
   16481 
   16482    finish_xTESTy ( andV128, andnV128, sign );
   16483    return delta;
   16484 }
   16485 
   16486 
   16487 /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
   16488 static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16489                                Long delta, Bool isAvx, Bool xIsZ )
   16490 {
   16491    IRTemp addr   = IRTemp_INVALID;
   16492    Int    alen   = 0;
   16493    HChar  dis_buf[50];
   16494    IRTemp srcVec = newTemp(Ity_V128);
   16495    UChar  modrm  = getUChar(delta);
   16496    const HChar* mbV    = isAvx ? "v" : "";
   16497    const HChar  how    = xIsZ ? 'z' : 's';
   16498    UInt   rG     = gregOfRexRM(pfx, modrm);
   16499    if ( epartIsReg(modrm) ) {
   16500       UInt rE = eregOfRexRM(pfx, modrm);
   16501       assign( srcVec, getXMMReg(rE) );
   16502       delta += 1;
   16503       DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16504    } else {
   16505       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16506       assign( srcVec,
   16507               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   16508       delta += alen;
   16509       DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16510    }
   16511 
   16512    IRExpr* res
   16513       = xIsZ /* do math for either zero or sign extend */
   16514         ? binop( Iop_InterleaveLO8x16,
   16515                  IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
   16516         : binop( Iop_SarN16x8,
   16517                  binop( Iop_ShlN16x8,
   16518                         binop( Iop_InterleaveLO8x16,
   16519                                IRExpr_Const( IRConst_V128(0) ),
   16520                                mkexpr(srcVec) ),
   16521                         mkU8(8) ),
   16522                  mkU8(8) );
   16523 
   16524    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   16525 
   16526    return delta;
   16527 }
   16528 
   16529 
   16530 /* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
   16531 static Long dis_PMOVxXBW_256 ( const VexAbiInfo* vbi, Prefix pfx,
   16532                                Long delta, Bool xIsZ )
   16533 {
   16534    IRTemp addr   = IRTemp_INVALID;
   16535    Int    alen   = 0;
   16536    HChar  dis_buf[50];
   16537    IRTemp srcVec = newTemp(Ity_V128);
   16538    UChar  modrm  = getUChar(delta);
   16539    UChar  how    = xIsZ ? 'z' : 's';
   16540    UInt   rG     = gregOfRexRM(pfx, modrm);
   16541    if ( epartIsReg(modrm) ) {
   16542       UInt rE = eregOfRexRM(pfx, modrm);
   16543       assign( srcVec, getXMMReg(rE) );
   16544       delta += 1;
   16545       DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   16546    } else {
   16547       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16548       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   16549       delta += alen;
   16550       DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   16551    }
   16552 
   16553    /* First do zero extend.  */
   16554    IRExpr* res
   16555       = binop( Iop_V128HLtoV256,
   16556                binop( Iop_InterleaveHI8x16,
   16557                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   16558                binop( Iop_InterleaveLO8x16,
   16559                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   16560    /* And if needed sign extension as well.  */
   16561    if (!xIsZ)
   16562       res = binop( Iop_SarN16x16,
   16563                    binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
   16564 
   16565    putYMMReg ( rG, res );
   16566 
   16567    return delta;
   16568 }
   16569 
   16570 
   16571 static Long dis_PMOVxXWD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16572                                Long delta, Bool isAvx, Bool xIsZ )
   16573 {
   16574    IRTemp addr   = IRTemp_INVALID;
   16575    Int    alen   = 0;
   16576    HChar  dis_buf[50];
   16577    IRTemp srcVec = newTemp(Ity_V128);
   16578    UChar  modrm  = getUChar(delta);
   16579    const HChar* mbV    = isAvx ? "v" : "";
   16580    const HChar  how    = xIsZ ? 'z' : 's';
   16581    UInt   rG     = gregOfRexRM(pfx, modrm);
   16582 
   16583    if ( epartIsReg(modrm) ) {
   16584       UInt rE = eregOfRexRM(pfx, modrm);
   16585       assign( srcVec, getXMMReg(rE) );
   16586       delta += 1;
   16587       DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16588    } else {
   16589       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16590       assign( srcVec,
   16591               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   16592       delta += alen;
   16593       DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16594    }
   16595 
   16596    IRExpr* res
   16597       = binop( Iop_InterleaveLO16x8,
   16598                IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
   16599    if (!xIsZ)
   16600       res = binop(Iop_SarN32x4,
   16601                   binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
   16602 
   16603    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16604       ( gregOfRexRM(pfx, modrm), res );
   16605 
   16606    return delta;
   16607 }
   16608 
   16609 
   16610 static Long dis_PMOVxXWD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   16611                                Long delta, Bool xIsZ )
   16612 {
   16613    IRTemp addr   = IRTemp_INVALID;
   16614    Int    alen   = 0;
   16615    HChar  dis_buf[50];
   16616    IRTemp srcVec = newTemp(Ity_V128);
   16617    UChar  modrm  = getUChar(delta);
   16618    UChar  how    = xIsZ ? 'z' : 's';
   16619    UInt   rG     = gregOfRexRM(pfx, modrm);
   16620 
   16621    if ( epartIsReg(modrm) ) {
   16622       UInt rE = eregOfRexRM(pfx, modrm);
   16623       assign( srcVec, getXMMReg(rE) );
   16624       delta += 1;
   16625       DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   16626    } else {
   16627       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16628       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   16629       delta += alen;
   16630       DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   16631    }
   16632 
   16633    IRExpr* res
   16634       = binop( Iop_V128HLtoV256,
   16635                binop( Iop_InterleaveHI16x8,
   16636                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   16637                binop( Iop_InterleaveLO16x8,
   16638                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   16639    if (!xIsZ)
   16640       res = binop(Iop_SarN32x8,
   16641                   binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
   16642 
   16643    putYMMReg ( rG, res );
   16644 
   16645    return delta;
   16646 }
   16647 
   16648 
   16649 static Long dis_PMOVSXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16650                                Long delta, Bool isAvx )
   16651 {
   16652    IRTemp addr     = IRTemp_INVALID;
   16653    Int    alen     = 0;
   16654    HChar  dis_buf[50];
   16655    IRTemp srcBytes = newTemp(Ity_I32);
   16656    UChar  modrm    = getUChar(delta);
   16657    const HChar* mbV = isAvx ? "v" : "";
   16658    UInt   rG       = gregOfRexRM(pfx, modrm);
   16659 
   16660    if ( epartIsReg( modrm ) ) {
   16661       UInt rE = eregOfRexRM(pfx, modrm);
   16662       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   16663       delta += 1;
   16664       DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16665    } else {
   16666       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16667       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   16668       delta += alen;
   16669       DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   16670    }
   16671 
   16672    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16673       ( rG, binop( Iop_64HLtoV128,
   16674                    unop( Iop_16Sto64,
   16675                          unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
   16676                    unop( Iop_16Sto64,
   16677                          unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
   16678    return delta;
   16679 }
   16680 
   16681 
/* Handles 256 bit versions of PMOVSXWQ: sign-extend four 16-bit
   lanes into four 64-bit lanes of the destination YMM register. */
static Long dis_PMOVSXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I64);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   /* s3..s0: the four 16-bit pieces of the source, s3 most significant. */
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane64( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: only 64 bits (four words) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Split the 64-bit source into its four words, then sign-extend
      each to 64 bits, keeping lane order: s3 ends up in the topmost
      64-bit lane of the result, s0 in the lowest. */
   breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s3) ),
                                unop( Iop_16Sto64, mkexpr(s2) ) ),
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s1) ),
                                unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
   return delta;
}
   16715 
   16716 
   16717 static Long dis_PMOVZXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16718                                Long delta, Bool isAvx )
   16719 {
   16720    IRTemp addr     = IRTemp_INVALID;
   16721    Int    alen     = 0;
   16722    HChar  dis_buf[50];
   16723    IRTemp srcVec = newTemp(Ity_V128);
   16724    UChar  modrm    = getUChar(delta);
   16725    const HChar* mbV = isAvx ? "v" : "";
   16726    UInt   rG       = gregOfRexRM(pfx, modrm);
   16727 
   16728    if ( epartIsReg( modrm ) ) {
   16729       UInt rE = eregOfRexRM(pfx, modrm);
   16730       assign( srcVec, getXMMReg(rE) );
   16731       delta += 1;
   16732       DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16733    } else {
   16734       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16735       assign( srcVec,
   16736               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   16737       delta += alen;
   16738       DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   16739    }
   16740 
   16741    IRTemp zeroVec = newTemp( Ity_V128 );
   16742    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   16743 
   16744    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16745       ( rG, binop( Iop_InterleaveLO16x8,
   16746                    mkexpr(zeroVec),
   16747                    binop( Iop_InterleaveLO16x8,
   16748                           mkexpr(zeroVec), mkexpr(srcVec) ) ) );
   16749    return delta;
   16750 }
   16751 
   16752 
/* Handles 256 bit versions of PMOVZXWQ: zero-extend four 16-bit
   lanes into four 64-bit lanes of the destination YMM register. */
static Long dis_PMOVZXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: only 64 bits (four words) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp( Ity_V128 );
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Inner interleave widens words 0..3 to 32 bits; the outer HI/LO
      interleaves then widen those to 64 bits, with words 3:2 landing
      in the high 128-bit half and words 1:0 in the low half. */
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_InterleaveHI16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ),
                         binop( Iop_InterleaveLO16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   16790 
   16791 
/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
static Long dis_PMOVxXDQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcI64 = newTemp(Ity_I64);
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   const HChar  how = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
      thing in a V128, with arbitrary junk in the top 64 bits.  Use
      one or both of them and let iropt clean up afterwards (as
      usual). */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
      delta += 1;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory form: only 64 bits (two dwords) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
      delta += alen;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Zero extension works on the V128 form via interleave-with-zero;
      sign extension works on the I64 form, extending each 32-bit
      half separately and reassembling. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO32x4,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_64HLtoV128,
                 unop( Iop_32Sto64,
                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
                 unop( Iop_32Sto64,
                       unop( Iop_64to32, mkexpr(srcI64) ) ) );

   /* AVX forms zero the upper YMM lane; SSE forms leave it alone. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   16837 
   16838 
/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. */
static Long dis_PMOVxXDQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
      thing in a V128, with arbitrary junk in the top 64 bits.  Use
      one or both of them and let iropt clean up afterwards (as
      usual). */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: the full 128-bit source is fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRExpr* res;
   if (xIsZ)
      /* Zero extension: HI/LO interleave with zeroes widens all four
         32-bit lanes to 64 bits. */
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
                   binop( Iop_InterleaveLO32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   else {
      /* Sign extension: split into the four 32-bit lanes (s3 most
         significant), extend each to 64 bits, and reassemble in the
         same lane order. */
      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s3) ),
                          unop( Iop_32Sto64, mkexpr(s2) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s1) ),
                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
   }

   putYMMReg ( rG, res );

   return delta;
}
   16890 
   16891 
   16892 /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
   16893 static Long dis_PMOVxXBD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16894                                Long delta, Bool isAvx, Bool xIsZ )
   16895 {
   16896    IRTemp addr   = IRTemp_INVALID;
   16897    Int    alen   = 0;
   16898    HChar  dis_buf[50];
   16899    IRTemp srcVec = newTemp(Ity_V128);
   16900    UChar  modrm  = getUChar(delta);
   16901    const HChar* mbV = isAvx ? "v" : "";
   16902    const HChar  how = xIsZ ? 'z' : 's';
   16903    UInt   rG     = gregOfRexRM(pfx, modrm);
   16904    if ( epartIsReg(modrm) ) {
   16905       UInt rE = eregOfRexRM(pfx, modrm);
   16906       assign( srcVec, getXMMReg(rE) );
   16907       delta += 1;
   16908       DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16909    } else {
   16910       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16911       assign( srcVec,
   16912               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   16913       delta += alen;
   16914       DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16915    }
   16916 
   16917    IRTemp zeroVec = newTemp(Ity_V128);
   16918    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   16919 
   16920    IRExpr* res
   16921       = binop(Iop_InterleaveLO8x16,
   16922               mkexpr(zeroVec),
   16923               binop(Iop_InterleaveLO8x16,
   16924                     mkexpr(zeroVec), mkexpr(srcVec)));
   16925    if (!xIsZ)
   16926       res = binop(Iop_SarN32x4,
   16927                   binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
   16928 
   16929    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   16930 
   16931    return delta;
   16932 }
   16933 
   16934 
/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */
static Long dis_PMOVxXBD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: only 64 bits (eight bytes) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* The inner interleave widens bytes 0..7 to 16-bit lanes; the
      outer HI/LO interleaves widen those to 32 bits, with bytes 7..4
      landing in the high 128-bit half and bytes 3..0 in the low. */
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop(Iop_InterleaveHI8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ),
               binop(Iop_InterleaveLO8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ) );
   /* Signed variant: propagate each byte's sign bit through the top
      24 bits of its 32-bit lane. */
   if (!xIsZ)
      res = binop(Iop_SarN32x8,
                  binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));

   putYMMReg ( rG, res );

   return delta;
}
   16980 
   16981 
   16982 /* Handles 128 bit versions of PMOVSXBQ. */
   16983 static Long dis_PMOVSXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16984                                Long delta, Bool isAvx )
   16985 {
   16986    IRTemp addr     = IRTemp_INVALID;
   16987    Int    alen     = 0;
   16988    HChar  dis_buf[50];
   16989    IRTemp srcBytes = newTemp(Ity_I16);
   16990    UChar  modrm    = getUChar(delta);
   16991    const HChar* mbV = isAvx ? "v" : "";
   16992    UInt   rG       = gregOfRexRM(pfx, modrm);
   16993    if ( epartIsReg(modrm) ) {
   16994       UInt rE = eregOfRexRM(pfx, modrm);
   16995       assign( srcBytes, getXMMRegLane16( rE, 0 ) );
   16996       delta += 1;
   16997       DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16998    } else {
   16999       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17000       assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
   17001       delta += alen;
   17002       DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17003    }
   17004 
   17005    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17006       ( rG, binop( Iop_64HLtoV128,
   17007                    unop( Iop_8Sto64,
   17008                          unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
   17009                    unop( Iop_8Sto64,
   17010                          unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   17011    return delta;
   17012 }
   17013 
   17014 
/* Handles 256 bit versions of PMOVSXBQ. */
static Long dis_PMOVSXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I32);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: only 32 bits (four bytes) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Sign-extend each of the four source bytes into its own 64-bit
      lane, preserving order: byte 3 (most significant) goes to the
      topmost lane, byte 0 to the lowest.  Each byte is isolated with
      a 32->16 then 16->8 narrowing chain before the 8->64 extend. */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ) ) ) );
   return delta;
}
   17059 
   17060 
/* Handles 128 bit versions of PMOVZXBQ. */
static Long dis_PMOVZXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory form: only 16 bits (two bytes) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128,
                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
      delta += alen;
      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Three successive interleaves with zero widen each of the two
      low bytes 8 -> 16 -> 32 -> 64 bits, giving the two
      zero-extended 64-bit lanes.  AVX forms also zero the upper
      YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO8x16,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   17098 
   17099 
/* Handles 256 bit versions of PMOVZXBQ. */
static Long dis_PMOVZXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Memory form: only 32 bits (four bytes) are fetched. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
      delta += alen;
      DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* The two inner interleaves widen bytes 0..3 to 32-bit lanes; the
      outermost HI/LO interleaves then widen those to 64 bits, with
      bytes 3:2 going to the high 128-bit half and bytes 1:0 to the
      low half. */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) )
                 ) );
   return delta;
}
   17143 
   17144 
   17145 static Long dis_PHMINPOSUW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17146                                  Long delta, Bool isAvx )
   17147 {
   17148    IRTemp addr   = IRTemp_INVALID;
   17149    Int    alen   = 0;
   17150    HChar  dis_buf[50];
   17151    UChar  modrm  = getUChar(delta);
   17152    const HChar* mbV = isAvx ? "v" : "";
   17153    IRTemp sV     = newTemp(Ity_V128);
   17154    IRTemp sHi    = newTemp(Ity_I64);
   17155    IRTemp sLo    = newTemp(Ity_I64);
   17156    IRTemp dLo    = newTemp(Ity_I64);
   17157    UInt   rG     = gregOfRexRM(pfx,modrm);
   17158    if (epartIsReg(modrm)) {
   17159       UInt rE = eregOfRexRM(pfx,modrm);
   17160       assign( sV, getXMMReg(rE) );
   17161       delta += 1;
   17162       DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   17163    } else {
   17164       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17165       if (!isAvx)
   17166          gen_SEGV_if_not_16_aligned(addr);
   17167       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   17168       delta += alen;
   17169       DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   17170    }
   17171    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   17172    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   17173    assign( dLo, mkIRExprCCall(
   17174                    Ity_I64, 0/*regparms*/,
   17175                    "amd64g_calculate_sse_phminposuw",
   17176                    &amd64g_calculate_sse_phminposuw,
   17177                    mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
   17178          ));
   17179    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17180       (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   17181    return delta;
   17182 }
   17183 
   17184 
   17185 static Long dis_AESx ( const VexAbiInfo* vbi, Prefix pfx,
   17186                        Long delta, Bool isAvx, UChar opc )
   17187 {
   17188    IRTemp addr   = IRTemp_INVALID;
   17189    Int    alen   = 0;
   17190    HChar  dis_buf[50];
   17191    UChar  modrm  = getUChar(delta);
   17192    UInt   rG     = gregOfRexRM(pfx, modrm);
   17193    UInt   regNoL = 0;
   17194    UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
   17195 
   17196    /* This is a nasty kludge.  We need to pass 2 x V128 to the
   17197       helper.  Since we can't do that, use a dirty
   17198       helper to compute the results directly from the XMM regs in
   17199       the guest state.  That means for the memory case, we need to
   17200       move the left operand into a pseudo-register (XMM16, let's
   17201       call it). */
   17202    if (epartIsReg(modrm)) {
   17203       regNoL = eregOfRexRM(pfx, modrm);
   17204       delta += 1;
   17205    } else {
   17206       regNoL = 16; /* use XMM16 as an intermediary */
   17207       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17208       /* alignment check needed ???? */
   17209       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17210       delta += alen;
   17211    }
   17212 
   17213    void*  fn = &amd64g_dirtyhelper_AES;
   17214    const HChar* nm = "amd64g_dirtyhelper_AES";
   17215 
   17216    /* Round up the arguments.  Note that this is a kludge -- the
   17217       use of mkU64 rather than mkIRExpr_HWord implies the
   17218       assumption that the host's word size is 64-bit. */
   17219    UInt gstOffD = ymmGuestRegOffset(rG);
   17220    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17221    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17222    IRExpr*  opc4         = mkU64(opc);
   17223    IRExpr*  gstOffDe     = mkU64(gstOffD);
   17224    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17225    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17226    IRExpr** args
   17227       = mkIRExprVec_5( IRExpr_BBPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
   17228 
   17229    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17230    /* It's not really a dirty call, but we can't use the clean helper
   17231       mechanism here for the very lame reason that we can't pass 2 x
   17232       V128s by value to a helper.  Hence this roundabout scheme. */
   17233    d->nFxState = 2;
   17234    vex_bzero(&d->fxState, sizeof(d->fxState));
   17235    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
   17236       the second for !isAvx or the third for isAvx.
   17237       AESIMC (0xDB) reads the first register, and writes the second. */
   17238    d->fxState[0].fx     = Ifx_Read;
   17239    d->fxState[0].offset = gstOffL;
   17240    d->fxState[0].size   = sizeof(U128);
   17241    d->fxState[1].offset = gstOffR;
   17242    d->fxState[1].size   = sizeof(U128);
   17243    if (opc == 0xDB)
   17244       d->fxState[1].fx   = Ifx_Write;
   17245    else if (!isAvx || rG == regNoR)
   17246       d->fxState[1].fx   = Ifx_Modify;
   17247    else {
   17248       d->fxState[1].fx     = Ifx_Read;
   17249       d->nFxState++;
   17250       d->fxState[2].fx     = Ifx_Write;
   17251       d->fxState[2].offset = gstOffD;
   17252       d->fxState[2].size   = sizeof(U128);
   17253    }
   17254 
   17255    stmt( IRStmt_Dirty(d) );
   17256    {
   17257       const HChar* opsuf;
   17258       switch (opc) {
   17259          case 0xDC: opsuf = "enc"; break;
   17260          case 0XDD: opsuf = "enclast"; break;
   17261          case 0xDE: opsuf = "dec"; break;
   17262          case 0xDF: opsuf = "declast"; break;
   17263          case 0xDB: opsuf = "imc"; break;
   17264          default: vassert(0);
   17265       }
   17266       DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
   17267           (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17268           nameXMMReg(regNoR),
   17269           (isAvx && opc != 0xDB) ? "," : "",
   17270           (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   17271    }
   17272    if (isAvx)
   17273       putYMMRegLane128( rG, 1, mkV128(0) );
   17274    return delta;
   17275 }
   17276 
   17277 static Long dis_AESKEYGENASSIST ( const VexAbiInfo* vbi, Prefix pfx,
   17278                                   Long delta, Bool isAvx )
   17279 {
   17280    IRTemp addr   = IRTemp_INVALID;
   17281    Int    alen   = 0;
   17282    HChar  dis_buf[50];
   17283    UChar  modrm  = getUChar(delta);
   17284    UInt   regNoL = 0;
   17285    UInt   regNoR = gregOfRexRM(pfx, modrm);
   17286    UChar  imm    = 0;
   17287 
   17288    /* This is a nasty kludge.  See AESENC et al. instructions. */
   17289    modrm = getUChar(delta);
   17290    if (epartIsReg(modrm)) {
   17291       regNoL = eregOfRexRM(pfx, modrm);
   17292       imm = getUChar(delta+1);
   17293       delta += 1+1;
   17294    } else {
   17295       regNoL = 16; /* use XMM16 as an intermediary */
   17296       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17297       /* alignment check ???? . */
   17298       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17299       imm = getUChar(delta+alen);
   17300       delta += alen+1;
   17301    }
   17302 
   17303    /* Who ya gonna call?  Presumably not Ghostbusters. */
   17304    void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   17305    const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
   17306 
   17307    /* Round up the arguments.  Note that this is a kludge -- the
   17308       use of mkU64 rather than mkIRExpr_HWord implies the
   17309       assumption that the host's word size is 64-bit. */
   17310    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17311    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17312 
   17313    IRExpr*  imme          = mkU64(imm & 0xFF);
   17314    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17315    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17316    IRExpr** args
   17317       = mkIRExprVec_4( IRExpr_BBPTR(), imme, gstOffLe, gstOffRe );
   17318 
   17319    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17320    /* It's not really a dirty call, but we can't use the clean helper
   17321       mechanism here for the very lame reason that we can't pass 2 x
   17322       V128s by value to a helper.  Hence this roundabout scheme. */
   17323    d->nFxState = 2;
   17324    vex_bzero(&d->fxState, sizeof(d->fxState));
   17325    d->fxState[0].fx     = Ifx_Read;
   17326    d->fxState[0].offset = gstOffL;
   17327    d->fxState[0].size   = sizeof(U128);
   17328    d->fxState[1].fx     = Ifx_Write;
   17329    d->fxState[1].offset = gstOffR;
   17330    d->fxState[1].size   = sizeof(U128);
   17331    stmt( IRStmt_Dirty(d) );
   17332 
   17333    DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
   17334        (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17335        nameXMMReg(regNoR));
   17336    if (isAvx)
   17337       putYMMRegLane128( regNoR, 1, mkV128(0) );
   17338    return delta;
   17339 }
   17340 
   17341 
   17342 __attribute__((noinline))
   17343 static
   17344 Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
   17345                           const VexAbiInfo* vbi,
   17346                           Prefix pfx, Int sz, Long deltaIN )
   17347 {
   17348    IRTemp addr  = IRTemp_INVALID;
   17349    UChar  modrm = 0;
   17350    Int    alen  = 0;
   17351    HChar  dis_buf[50];
   17352 
   17353    *decode_OK = False;
   17354 
   17355    Long   delta = deltaIN;
   17356    UChar  opc   = getUChar(delta);
   17357    delta++;
   17358    switch (opc) {
   17359 
   17360    case 0x10:
   17361    case 0x14:
   17362    case 0x15:
   17363       /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
   17364          66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
   17365          66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
   17366          Blend at various granularities, with XMM0 (implicit operand)
   17367          providing the controlling mask.
   17368       */
   17369       if (have66noF2noF3(pfx) && sz == 2) {
   17370          modrm = getUChar(delta);
   17371 
   17372          const HChar* nm    = NULL;
   17373          UInt   gran  = 0;
   17374          IROp   opSAR = Iop_INVALID;
   17375          switch (opc) {
   17376             case 0x10:
   17377                nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
   17378                break;
   17379             case 0x14:
   17380                nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
   17381                break;
   17382             case 0x15:
   17383                nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
   17384                break;
   17385          }
   17386          vassert(nm);
   17387 
   17388          IRTemp vecE = newTemp(Ity_V128);
   17389          IRTemp vecG = newTemp(Ity_V128);
   17390          IRTemp vec0 = newTemp(Ity_V128);
   17391 
   17392          if ( epartIsReg(modrm) ) {
   17393             assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
   17394             delta += 1;
   17395             DIP( "%s %s,%s\n", nm,
   17396                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17397                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17398          } else {
   17399             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17400             gen_SEGV_if_not_16_aligned( addr );
   17401             assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   17402             delta += alen;
   17403             DIP( "%s %s,%s\n", nm,
   17404                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17405          }
   17406 
   17407          assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
   17408          assign(vec0, getXMMReg(0));
   17409 
   17410          IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
   17411          putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
   17412 
   17413          goto decode_success;
   17414       }
   17415       break;
   17416 
   17417    case 0x17:
   17418       /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
   17419          Logical compare (set ZF and CF from AND/ANDN of the operands) */
   17420       if (have66noF2noF3(pfx)
   17421           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   17422          delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
   17423          goto decode_success;
   17424       }
   17425       break;
   17426 
   17427    case 0x20:
   17428       /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
   17429          Packed Move with Sign Extend from Byte to Word (XMM) */
   17430       if (have66noF2noF3(pfx) && sz == 2) {
   17431          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   17432                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17433          goto decode_success;
   17434       }
   17435       break;
   17436 
   17437    case 0x21:
   17438       /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
   17439          Packed Move with Sign Extend from Byte to DWord (XMM) */
   17440       if (have66noF2noF3(pfx) && sz == 2) {
   17441          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   17442                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17443          goto decode_success;
   17444       }
   17445       break;
   17446 
   17447    case 0x22:
   17448       /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
   17449          Packed Move with Sign Extend from Byte to QWord (XMM) */
   17450       if (have66noF2noF3(pfx) && sz == 2) {
   17451          delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17452          goto decode_success;
   17453       }
   17454       break;
   17455 
   17456    case 0x23:
   17457       /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
   17458          Packed Move with Sign Extend from Word to DWord (XMM) */
   17459       if (have66noF2noF3(pfx) && sz == 2) {
   17460          delta = dis_PMOVxXWD_128(vbi, pfx, delta,
   17461                                   False/*!isAvx*/, False/*!xIsZ*/);
   17462          goto decode_success;
   17463       }
   17464       break;
   17465 
   17466    case 0x24:
   17467       /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
   17468          Packed Move with Sign Extend from Word to QWord (XMM) */
   17469       if (have66noF2noF3(pfx) && sz == 2) {
   17470          delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17471          goto decode_success;
   17472       }
   17473       break;
   17474 
   17475    case 0x25:
   17476       /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
   17477          Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   17478       if (have66noF2noF3(pfx) && sz == 2) {
   17479          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   17480                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17481          goto decode_success;
   17482       }
   17483       break;
   17484 
   17485    case 0x28:
   17486       /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
   17487          0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
   17488          64-bit half */
   17489       /* This is a really poor translation -- could be improved if
   17490          performance critical.  It's a copy-paste of PMULUDQ, too. */
   17491       if (have66noF2noF3(pfx) && sz == 2) {
   17492          IRTemp sV = newTemp(Ity_V128);
   17493          IRTemp dV = newTemp(Ity_V128);
   17494          modrm = getUChar(delta);
   17495          UInt rG = gregOfRexRM(pfx,modrm);
   17496          assign( dV, getXMMReg(rG) );
   17497          if (epartIsReg(modrm)) {
   17498             UInt rE = eregOfRexRM(pfx,modrm);
   17499             assign( sV, getXMMReg(rE) );
   17500             delta += 1;
   17501             DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   17502          } else {
   17503             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17504             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   17505             delta += alen;
   17506             DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
   17507          }
   17508 
   17509          putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
   17510          goto decode_success;
   17511       }
   17512       break;
   17513 
   17514    case 0x29:
   17515       /* 66 0F 38 29 = PCMPEQQ
   17516          64x2 equality comparison */
   17517       if (have66noF2noF3(pfx) && sz == 2) {
   17518          /* FIXME: this needs an alignment check */
   17519          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   17520                                     "pcmpeqq", Iop_CmpEQ64x2, False );
   17521          goto decode_success;
   17522       }
   17523       break;
   17524 
   17525    case 0x2A:
   17526       /* 66 0F 38 2A = MOVNTDQA
   17527          "non-temporal" "streaming" load
   17528          Handle like MOVDQA but only memory operand is allowed */
   17529       if (have66noF2noF3(pfx) && sz == 2) {
   17530          modrm = getUChar(delta);
   17531          if (!epartIsReg(modrm)) {
   17532             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17533             gen_SEGV_if_not_16_aligned( addr );
   17534             putXMMReg( gregOfRexRM(pfx,modrm),
   17535                        loadLE(Ity_V128, mkexpr(addr)) );
   17536             DIP("movntdqa %s,%s\n", dis_buf,
   17537                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   17538             delta += alen;
   17539             goto decode_success;
   17540          }
   17541       }
   17542       break;
   17543 
   17544    case 0x2B:
   17545       /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
   17546          2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   17547       if (have66noF2noF3(pfx) && sz == 2) {
   17548 
   17549          modrm = getUChar(delta);
   17550 
   17551          IRTemp argL = newTemp(Ity_V128);
   17552          IRTemp argR = newTemp(Ity_V128);
   17553 
   17554          if ( epartIsReg(modrm) ) {
   17555             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   17556             delta += 1;
   17557             DIP( "packusdw %s,%s\n",
   17558                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17559                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17560          } else {
   17561             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17562             gen_SEGV_if_not_16_aligned( addr );
   17563             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   17564             delta += alen;
   17565             DIP( "packusdw %s,%s\n",
   17566                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17567          }
   17568 
   17569          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   17570 
   17571          putXMMReg( gregOfRexRM(pfx, modrm),
   17572                     binop( Iop_QNarrowBin32Sto16Ux8,
   17573                            mkexpr(argL), mkexpr(argR)) );
   17574 
   17575          goto decode_success;
   17576       }
   17577       break;
   17578 
   17579    case 0x30:
   17580       /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
   17581          Packed Move with Zero Extend from Byte to Word (XMM) */
   17582       if (have66noF2noF3(pfx) && sz == 2) {
   17583          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   17584                                    False/*!isAvx*/, True/*xIsZ*/ );
   17585          goto decode_success;
   17586       }
   17587       break;
   17588 
   17589    case 0x31:
   17590       /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
   17591          Packed Move with Zero Extend from Byte to DWord (XMM) */
   17592       if (have66noF2noF3(pfx) && sz == 2) {
   17593          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   17594                                    False/*!isAvx*/, True/*xIsZ*/ );
   17595          goto decode_success;
   17596       }
   17597       break;
   17598 
   17599    case 0x32:
   17600       /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
   17601          Packed Move with Zero Extend from Byte to QWord (XMM) */
   17602       if (have66noF2noF3(pfx) && sz == 2) {
   17603          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17604          goto decode_success;
   17605       }
   17606       break;
   17607 
   17608    case 0x33:
   17609       /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
   17610          Packed Move with Zero Extend from Word to DWord (XMM) */
   17611       if (have66noF2noF3(pfx) && sz == 2) {
   17612          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   17613                                    False/*!isAvx*/, True/*xIsZ*/ );
   17614          goto decode_success;
   17615       }
   17616       break;
   17617 
   17618    case 0x34:
   17619       /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
   17620          Packed Move with Zero Extend from Word to QWord (XMM) */
   17621       if (have66noF2noF3(pfx) && sz == 2) {
   17622          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17623          goto decode_success;
   17624       }
   17625       break;
   17626 
   17627    case 0x35:
   17628       /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
   17629          Packed Move with Zero Extend from DWord to QWord (XMM) */
   17630       if (have66noF2noF3(pfx) && sz == 2) {
   17631          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   17632                                    False/*!isAvx*/, True/*xIsZ*/ );
   17633          goto decode_success;
   17634       }
   17635       break;
   17636 
   17637    case 0x37:
   17638       /* 66 0F 38 37 = PCMPGTQ
   17639          64x2 comparison (signed, presumably; the Intel docs don't say :-)
   17640       */
   17641       if (have66noF2noF3(pfx) && sz == 2) {
   17642          /* FIXME: this needs an alignment check */
   17643          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   17644                                     "pcmpgtq", Iop_CmpGT64Sx2, False );
   17645          goto decode_success;
   17646       }
   17647       break;
   17648 
   17649    case 0x38:
   17650    case 0x3C:
   17651       /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
   17652          66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
   17653       */
   17654       if (have66noF2noF3(pfx) && sz == 2) {
   17655          /* FIXME: this needs an alignment check */
   17656          Bool isMAX = opc == 0x3C;
   17657          delta = dis_SSEint_E_to_G(
   17658                     vbi, pfx, delta,
   17659                     isMAX ? "pmaxsb" : "pminsb",
   17660                     isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   17661                     False
   17662                  );
   17663          goto decode_success;
   17664       }
   17665       break;
   17666 
   17667    case 0x39:
   17668    case 0x3D:
   17669       /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
   17670          Minimum of Packed Signed Double Word Integers (XMM)
   17671          66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
   17672          Maximum of Packed Signed Double Word Integers (XMM)
   17673       */
   17674       if (have66noF2noF3(pfx) && sz == 2) {
   17675          /* FIXME: this needs an alignment check */
   17676          Bool isMAX = opc == 0x3D;
   17677          delta = dis_SSEint_E_to_G(
   17678                     vbi, pfx, delta,
   17679                     isMAX ? "pmaxsd" : "pminsd",
   17680                     isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
   17681                     False
   17682                  );
   17683          goto decode_success;
   17684       }
   17685       break;
   17686 
   17687    case 0x3A:
   17688    case 0x3E:
   17689       /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
   17690          Minimum of Packed Unsigned Word Integers (XMM)
   17691          66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
   17692          Maximum of Packed Unsigned Word Integers (XMM)
   17693       */
   17694       if (have66noF2noF3(pfx) && sz == 2) {
   17695          /* FIXME: this needs an alignment check */
   17696          Bool isMAX = opc == 0x3E;
   17697          delta = dis_SSEint_E_to_G(
   17698                     vbi, pfx, delta,
   17699                     isMAX ? "pmaxuw" : "pminuw",
   17700                     isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
   17701                     False
   17702                  );
   17703          goto decode_success;
   17704       }
   17705       break;
   17706 
   17707    case 0x3B:
   17708    case 0x3F:
   17709       /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
   17710          Minimum of Packed Unsigned Doubleword Integers (XMM)
   17711          66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
   17712          Maximum of Packed Unsigned Doubleword Integers (XMM)
   17713       */
   17714       if (have66noF2noF3(pfx) && sz == 2) {
   17715          /* FIXME: this needs an alignment check */
   17716          Bool isMAX = opc == 0x3F;
   17717          delta = dis_SSEint_E_to_G(
   17718                     vbi, pfx, delta,
   17719                     isMAX ? "pmaxud" : "pminud",
   17720                     isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
   17721                     False
   17722                  );
   17723          goto decode_success;
   17724       }
   17725       break;
   17726 
   17727    case 0x40:
   17728       /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
   17729          32x4 integer multiply from xmm2/m128 to xmm1 */
   17730       if (have66noF2noF3(pfx) && sz == 2) {
   17731 
   17732          modrm = getUChar(delta);
   17733 
   17734          IRTemp argL = newTemp(Ity_V128);
   17735          IRTemp argR = newTemp(Ity_V128);
   17736 
   17737          if ( epartIsReg(modrm) ) {
   17738             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   17739             delta += 1;
   17740             DIP( "pmulld %s,%s\n",
   17741                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17742                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17743          } else {
   17744             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17745             gen_SEGV_if_not_16_aligned( addr );
   17746             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   17747             delta += alen;
   17748             DIP( "pmulld %s,%s\n",
   17749                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17750          }
   17751 
   17752          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   17753 
   17754          putXMMReg( gregOfRexRM(pfx, modrm),
   17755                     binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
   17756 
   17757          goto decode_success;
   17758       }
   17759       break;
   17760 
   17761    case 0x41:
   17762       /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
   17763          Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
   17764       if (have66noF2noF3(pfx) && sz == 2) {
   17765          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
   17766          goto decode_success;
   17767       }
   17768       break;
   17769 
   17770    case 0xDC:
   17771    case 0xDD:
   17772    case 0xDE:
   17773    case 0xDF:
   17774    case 0xDB:
   17775       /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
   17776                   DD /r = AESENCLAST xmm1, xmm2/m128
   17777                   DE /r = AESDEC xmm1, xmm2/m128
   17778                   DF /r = AESDECLAST xmm1, xmm2/m128
   17779 
   17780                   DB /r = AESIMC xmm1, xmm2/m128 */
   17781       if (have66noF2noF3(pfx) && sz == 2) {
   17782          delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
   17783          goto decode_success;
   17784       }
   17785       break;
   17786 
   17787    case 0xF0:
   17788    case 0xF1:
   17789       /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
   17790          F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
   17791          The decoding on this is a bit unusual.
   17792       */
   17793       if (haveF2noF3(pfx)
   17794           && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
   17795          modrm = getUChar(delta);
   17796 
   17797          if (opc == 0xF0)
   17798             sz = 1;
   17799          else
   17800             vassert(sz == 2 || sz == 4 || sz == 8);
   17801 
   17802          IRType tyE = szToITy(sz);
   17803          IRTemp valE = newTemp(tyE);
   17804 
   17805          if (epartIsReg(modrm)) {
   17806             assign(valE, getIRegE(sz, pfx, modrm));
   17807             delta += 1;
   17808             DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
   17809                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   17810          } else {
   17811             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17812             assign(valE, loadLE(tyE, mkexpr(addr)));
   17813             delta += alen;
   17814             DIP("crc32b %s,%s\n", dis_buf,
   17815                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   17816          }
   17817 
   17818          /* Somewhat funny getting/putting of the crc32 value, in order
   17819             to ensure that it turns into 64-bit gets and puts.  However,
   17820             mask off the upper 32 bits so as to not get memcheck false
   17821             +ves around the helper call. */
   17822          IRTemp valG0 = newTemp(Ity_I64);
   17823          assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
   17824                              mkU64(0xFFFFFFFF)));
   17825 
   17826          const HChar* nm = NULL;
   17827          void*  fn = NULL;
   17828          switch (sz) {
   17829             case 1: nm = "amd64g_calc_crc32b";
   17830                     fn = &amd64g_calc_crc32b; break;
   17831             case 2: nm = "amd64g_calc_crc32w";
   17832                     fn = &amd64g_calc_crc32w; break;
   17833             case 4: nm = "amd64g_calc_crc32l";
   17834                     fn = &amd64g_calc_crc32l; break;
   17835             case 8: nm = "amd64g_calc_crc32q";
   17836                     fn = &amd64g_calc_crc32q; break;
   17837          }
   17838          vassert(nm && fn);
   17839          IRTemp valG1 = newTemp(Ity_I64);
   17840          assign(valG1,
   17841                 mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
   17842                               mkIRExprVec_2(mkexpr(valG0),
   17843                                             widenUto64(mkexpr(valE)))));
   17844 
   17845          putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
   17846          goto decode_success;
   17847       }
   17848       break;
   17849 
   17850    default:
   17851       break;
   17852 
   17853    }
   17854 
   17855   //decode_failure:
   17856    *decode_OK = False;
   17857    return deltaIN;
   17858 
   17859   decode_success:
   17860    *decode_OK = True;
   17861    return delta;
   17862 }
   17863 
   17864 
   17865 /*------------------------------------------------------------*/
   17866 /*---                                                      ---*/
   17867 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
   17868 /*---                                                      ---*/
   17869 /*------------------------------------------------------------*/
   17870 
   17871 static Long dis_PEXTRW ( const VexAbiInfo* vbi, Prefix pfx,
   17872                          Long delta, Bool isAvx )
   17873 {
   17874    IRTemp addr  = IRTemp_INVALID;
   17875    IRTemp t0    = IRTemp_INVALID;
   17876    IRTemp t1    = IRTemp_INVALID;
   17877    IRTemp t2    = IRTemp_INVALID;
   17878    IRTemp t3    = IRTemp_INVALID;
   17879    UChar  modrm = getUChar(delta);
   17880    Int    alen  = 0;
   17881    HChar  dis_buf[50];
   17882    UInt   rG    = gregOfRexRM(pfx,modrm);
   17883    Int    imm8_20;
   17884    IRTemp xmm_vec = newTemp(Ity_V128);
   17885    IRTemp d16   = newTemp(Ity_I16);
   17886    const HChar* mbV = isAvx ? "v" : "";
   17887 
   17888    vassert(0==getRexW(pfx)); /* ensured by caller */
   17889    assign( xmm_vec, getXMMReg(rG) );
   17890    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17891 
   17892    if ( epartIsReg( modrm ) ) {
   17893       imm8_20 = (Int)(getUChar(delta+1) & 7);
   17894    } else {
   17895       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17896       imm8_20 = (Int)(getUChar(delta+alen) & 7);
   17897    }
   17898 
   17899    switch (imm8_20) {
   17900       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
   17901       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
   17902       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
   17903       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
   17904       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
   17905       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
   17906       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
   17907       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
   17908       default: vassert(0);
   17909    }
   17910 
   17911    if ( epartIsReg( modrm ) ) {
   17912       UInt rE = eregOfRexRM(pfx,modrm);
   17913       putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
   17914       delta += 1+1;
   17915       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
   17916            nameXMMReg( rG ), nameIReg32( rE ) );
   17917    } else {
   17918       storeLE( mkexpr(addr), mkexpr(d16) );
   17919       delta += alen+1;
   17920       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   17921    }
   17922    return delta;
   17923 }
   17924 
   17925 
   17926 static Long dis_PEXTRD ( const VexAbiInfo* vbi, Prefix pfx,
   17927                          Long delta, Bool isAvx )
   17928 {
   17929    IRTemp addr  = IRTemp_INVALID;
   17930    IRTemp t0    = IRTemp_INVALID;
   17931    IRTemp t1    = IRTemp_INVALID;
   17932    IRTemp t2    = IRTemp_INVALID;
   17933    IRTemp t3    = IRTemp_INVALID;
   17934    UChar  modrm = 0;
   17935    Int    alen  = 0;
   17936    HChar  dis_buf[50];
   17937 
   17938    Int    imm8_10;
   17939    IRTemp xmm_vec   = newTemp(Ity_V128);
   17940    IRTemp src_dword = newTemp(Ity_I32);
   17941    const HChar* mbV = isAvx ? "v" : "";
   17942 
   17943    vassert(0==getRexW(pfx)); /* ensured by caller */
   17944    modrm = getUChar(delta);
   17945    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   17946    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17947 
   17948    if ( epartIsReg( modrm ) ) {
   17949       imm8_10 = (Int)(getUChar(delta+1) & 3);
   17950    } else {
   17951       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17952       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   17953    }
   17954 
   17955    switch ( imm8_10 ) {
   17956       case 0:  assign( src_dword, mkexpr(t0) ); break;
   17957       case 1:  assign( src_dword, mkexpr(t1) ); break;
   17958       case 2:  assign( src_dword, mkexpr(t2) ); break;
   17959       case 3:  assign( src_dword, mkexpr(t3) ); break;
   17960       default: vassert(0);
   17961    }
   17962 
   17963    if ( epartIsReg( modrm ) ) {
   17964       putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
   17965       delta += 1+1;
   17966       DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
   17967            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   17968            nameIReg32( eregOfRexRM(pfx, modrm) ) );
   17969    } else {
   17970       storeLE( mkexpr(addr), mkexpr(src_dword) );
   17971       delta += alen+1;
   17972       DIP( "%spextrd $%d, %s,%s\n", mbV,
   17973            imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   17974    }
   17975    return delta;
   17976 }
   17977 
   17978 
   17979 static Long dis_PEXTRQ ( const VexAbiInfo* vbi, Prefix pfx,
   17980                          Long delta, Bool isAvx )
   17981 {
   17982    IRTemp addr  = IRTemp_INVALID;
   17983    UChar  modrm = 0;
   17984    Int    alen  = 0;
   17985    HChar  dis_buf[50];
   17986 
   17987    Int imm8_0;
   17988    IRTemp xmm_vec   = newTemp(Ity_V128);
   17989    IRTemp src_qword = newTemp(Ity_I64);
   17990    const HChar* mbV = isAvx ? "v" : "";
   17991 
   17992    vassert(1==getRexW(pfx)); /* ensured by caller */
   17993    modrm = getUChar(delta);
   17994    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   17995 
   17996    if ( epartIsReg( modrm ) ) {
   17997       imm8_0 = (Int)(getUChar(delta+1) & 1);
   17998    } else {
   17999       addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18000       imm8_0 = (Int)(getUChar(delta+alen) & 1);
   18001    }
   18002 
   18003    switch ( imm8_0 ) {
   18004       case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
   18005                break;
   18006       case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
   18007                break;
   18008       default: vassert(0);
   18009    }
   18010 
   18011    if ( epartIsReg( modrm ) ) {
   18012       putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
   18013       delta += 1+1;
   18014       DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
   18015            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18016            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18017    } else {
   18018       storeLE( mkexpr(addr), mkexpr(src_qword) );
   18019       delta += alen+1;
   18020       DIP( "%spextrq $%d, %s,%s\n", mbV,
   18021            imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18022    }
   18023    return delta;
   18024 }
   18025 
   18026 static IRExpr* math_CTZ32(IRExpr *exp)
   18027 {
   18028    /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   18029    return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
   18030 }
   18031 
static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
                               Long delta, UChar opc, UChar imm,
                               HChar dis_buf[])
{
   /* Inline-IR fast path for PCMPISTRI with imm8 == 0x38 or 0x3A:
      equality-each comparison of unsigned bytes, returning the index
      of the least significant result bit in RCX; imm8 == 0x3A
      additionally applies "polarity 3" (XOR with validL).  Also sets
      the OSZACP condition codes.  This avoids the dirty-helper
      round trip used by dis_PCMPxSTRx for the general case.
      Returns delta unchanged (the caller has already consumed the
      instruction bytes). */
   /* We only handle PCMPISTRI for now */
   vassert((opc & 0x03) == 0x03);
   /* And only an immediate byte of 0x38 or 0x3A */
   vassert((imm & ~0x02) == 0x38);

   /* FIXME: Is this correct when RegNoL == 16 ? */
   IRTemp argL = newTemp(Ity_V128);
   assign(argL, getXMMReg(regNoL));
   IRTemp argR = newTemp(Ity_V128);
   assign(argR, getXMMReg(regNoR));

   /* zmaskL/zmaskR: bit i is set iff byte i of argL/argR is zero,
      i.e. they mark the implicit string terminators. */
   IRTemp zmaskL = newTemp(Ity_I32);
   assign(zmaskL, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   IRTemp zmaskR = newTemp(Ity_I32);
   assign(zmaskR, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));

   /* We want validL = ~(zmaskL | -zmaskL)

      But this formulation kills memcheck's validity tracking when any
      bits above the first "1" are invalid.  So reformulate as:

      validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   */

   IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));

   /* Generate a bool expression which is zero iff the original is
      zero.  Do this carefully so memcheck can propagate validity bits
      correctly.
    */
   IRTemp zmaskL_zero = newTemp(Ity_I1);
   assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));

   /* validL: mask of bytes before the first zero byte of argL. */
   IRTemp validL = newTemp(Ity_I32);
   assign(validL, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskL_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzL),
                                   mkU32(0)),
                        mkU32(1)));

   /* And similarly for validR. */
   IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   IRTemp zmaskR_zero = newTemp(Ity_I1);
   assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   IRTemp validR = newTemp(Ity_I32);
   assign(validR, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskR_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzR),
                                   mkU32(0)),
                        mkU32(1)));

   /* Do the actual comparison: bit i set iff byte i of argL equals
      byte i of argR. */
   IRExpr *boolResII = unop(Iop_16Uto32,
                            unop(Iop_GetMSBs8x16,
                                 binop(Iop_CmpEQ8x16, mkexpr(argL),
                                                      mkexpr(argR))));

   /* Compute boolresII & validL & validR (i.e., if both valid, use
      comparison result) */
   IRExpr *intRes1_a = binop(Iop_And32, boolResII,
                             binop(Iop_And32,
                                   mkexpr(validL), mkexpr(validR)));

   /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
                                             mkexpr(validL), mkexpr(validR)));
   /* Otherwise, zero. */
   IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
                           binop(Iop_Or32, intRes1_a, intRes1_b));

   /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
      result. */
   IRTemp intRes2 = newTemp(Ity_I32);
   assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
                         binop(Iop_Xor32, intRes1, mkexpr(validL))));

   /* If the 0x40 bit were set in imm=0x3A, we would return the index
      of the msb.  Since it is clear, we return the index of the
      lsb. */
   /* The OR with 0x10000 guarantees a set bit, so ctz yields 16 when
      intRes2 is zero, as the architecture requires. */
   IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
                                     mkexpr(intRes2), mkU32(0x10000)));

   /* And thats our rcx. */
   putIReg32(R_RCX, newECX);

   /* Now for the condition codes... */

   /* C == 0 iff intRes2 == 0 */
   IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
                                     mkU32(0)),
                               mkU32(1 << AMD64G_CC_SHIFT_C),
                               mkU32(0));
   /* Z == 1 iff any in argL is 0 */
   IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_Z),
                               mkU32(0));
   /* S == 1 iff any in argR is 0 */
   IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_S),
                               mkU32(0));
   /* O == IntRes2[0] */
   IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
                                          mkU32(0x01)),
                         mkU8(AMD64G_CC_SHIFT_O));

   /* Put them all together via the COPY thunk: the flag bits are
      delivered literally in CC_DEP1. */
   IRTemp cc = newTemp(Ity_I64);
   assign(cc, widenUto64(binop(Iop_Or32,
                               binop(Iop_Or32, c_bit, z_bit),
                               binop(Iop_Or32, s_bit, o_bit))));
   stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));

   return delta;
}
   18157 
   18158 /* This can fail, in which case it returns the original (unchanged)
   18159    delta. */
/* This can fail, in which case it returns the original (unchanged)
   delta. */
/* Decode and translate PCMPISTRI / PCMPISTRM / PCMPESTRI / PCMPESTRM
   (and their AVX forms).  opc bit 1 distinguishes the I (implicit
   length) from E (explicit length) variants; opc bit 0 distinguishes
   index (STRI) from mask (STRM) results.  Most cases go through the
   dirty helper amd64g_dirtyhelper_PCMPxSTRx, which reads both vector
   operands straight out of the guest state; imm == 0x3A PCMPISTRI is
   special-cased to inline IR via dis_PCMPISTRI_3A.  Unverified imm8
   values are rejected by returning the original delta. */
static Long dis_PCMPxSTRx ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s). */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x12: case 0x14: case 0x1A:
      case 0x30: case 0x34: case 0x38: case 0x3A:
      case 0x40: case 0x44: case 0x46: case 0x4A:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
      case 0x13:            case 0x1B:
                            case 0x39: case 0x3B:
                 case 0x45:            case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   /* The E variants take explicit lengths in RDX/RAX; pass zero for
      the I variants so the helper sees a deterministic value. */
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_BBPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated. And for a xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   18297 
   18298 
   18299 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
   18300 {
   18301    vassert(imm8 >= 0 && imm8 <= 15);
   18302 
   18303    // Create a V128 value which has the selected byte in the
   18304    // specified lane, and zeroes everywhere else.
   18305    IRTemp tmp128    = newTemp(Ity_V128);
   18306    IRTemp halfshift = newTemp(Ity_I64);
   18307    assign(halfshift, binop(Iop_Shl64,
   18308                            unop(Iop_8Uto64, mkexpr(u8)),
   18309                            mkU8(8 * (imm8 & 7))));
   18310    if (imm8 < 8) {
   18311       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   18312    } else {
   18313       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   18314    }
   18315 
   18316    UShort mask = ~(1 << imm8);
   18317    IRTemp res  = newTemp(Ity_V128);
   18318    assign( res, binop(Iop_OrV128,
   18319                       mkexpr(tmp128),
   18320                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   18321    return res;
   18322 }
   18323 
   18324 
   18325 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
   18326 {
   18327    IRTemp z32 = newTemp(Ity_I32);
   18328    assign(z32, mkU32(0));
   18329 
   18330    /* Surround u32 with zeroes as per imm, giving us something we can
   18331       OR into a suitably masked-out v128.*/
   18332    IRTemp withZs = newTemp(Ity_V128);
   18333    UShort mask = 0;
   18334    switch (imm8) {
   18335       case 3:  mask = 0x0FFF;
   18336                assign(withZs, mkV128from32s(u32, z32, z32, z32));
   18337                break;
   18338       case 2:  mask = 0xF0FF;
   18339                assign(withZs, mkV128from32s(z32, u32, z32, z32));
   18340                break;
   18341       case 1:  mask = 0xFF0F;
   18342                assign(withZs, mkV128from32s(z32, z32, u32, z32));
   18343                break;
   18344       case 0:  mask = 0xFFF0;
   18345                assign(withZs, mkV128from32s(z32, z32, z32, u32));
   18346                break;
   18347       default: vassert(0);
   18348    }
   18349 
   18350    IRTemp res = newTemp(Ity_V128);
   18351    assign(res, binop( Iop_OrV128,
   18352                       mkexpr(withZs),
   18353                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18354    return res;
   18355 }
   18356 
   18357 
   18358 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
   18359 {
   18360    /* Surround u64 with zeroes as per imm, giving us something we can
   18361       OR into a suitably masked-out v128.*/
   18362    IRTemp withZs = newTemp(Ity_V128);
   18363    UShort mask = 0;
   18364    if (imm8 == 0) {
   18365       mask = 0xFF00;
   18366       assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   18367    } else {
   18368       vassert(imm8 == 1);
   18369       mask = 0x00FF;
   18370       assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   18371    }
   18372 
   18373    IRTemp res = newTemp(Ity_V128);
   18374    assign( res, binop( Iop_OrV128,
   18375                        mkexpr(withZs),
   18376                        binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18377    return res;
   18378 }
   18379 
   18380 
   18381 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
   18382 {
   18383    const IRTemp inval = IRTemp_INVALID;
   18384    IRTemp dstDs[4] = { inval, inval, inval, inval };
   18385    breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
   18386 
   18387    vassert(imm8 <= 255);
   18388    dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
   18389 
   18390    UInt imm8_zmask = (imm8 & 15);
   18391    IRTemp zero_32 = newTemp(Ity_I32);
   18392    assign( zero_32, mkU32(0) );
   18393    IRTemp resV = newTemp(Ity_V128);
   18394    assign( resV, mkV128from32s(
   18395                     ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
   18396                     ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
   18397                     ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
   18398                     ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   18399    return resV;
   18400 }
   18401 
   18402 
   18403 static Long dis_PEXTRB_128_GtoE ( const VexAbiInfo* vbi, Prefix pfx,
   18404                                   Long delta, Bool isAvx )
   18405 {
   18406    IRTemp addr     = IRTemp_INVALID;
   18407    Int    alen     = 0;
   18408    HChar  dis_buf[50];
   18409    IRTemp xmm_vec  = newTemp(Ity_V128);
   18410    IRTemp sel_lane = newTemp(Ity_I32);
   18411    IRTemp shr_lane = newTemp(Ity_I32);
   18412    const HChar* mbV = isAvx ? "v" : "";
   18413    UChar  modrm    = getUChar(delta);
   18414    IRTemp t3, t2, t1, t0;
   18415    Int    imm8;
   18416    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18417    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   18418    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18419 
   18420    if ( epartIsReg( modrm ) ) {
   18421       imm8 = (Int)getUChar(delta+1);
   18422    } else {
   18423       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18424       imm8 = (Int)getUChar(delta+alen);
   18425    }
   18426    switch ( (imm8 >> 2) & 3 ) {
   18427       case 0:  assign( sel_lane, mkexpr(t0) ); break;
   18428       case 1:  assign( sel_lane, mkexpr(t1) ); break;
   18429       case 2:  assign( sel_lane, mkexpr(t2) ); break;
   18430       case 3:  assign( sel_lane, mkexpr(t3) ); break;
   18431       default: vassert(0);
   18432    }
   18433    assign( shr_lane,
   18434            binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
   18435 
   18436    if ( epartIsReg( modrm ) ) {
   18437       putIReg64( eregOfRexRM(pfx,modrm),
   18438                  unop( Iop_32Uto64,
   18439                        binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
   18440       delta += 1+1;
   18441       DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
   18442            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18443            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18444    } else {
   18445       storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
   18446       delta += alen+1;
   18447       DIP( "%spextrb $%d,%s,%s\n", mbV,
   18448            imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18449    }
   18450 
   18451    return delta;
   18452 }
   18453 
   18454 
   18455 static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
   18456 {
   18457    vassert(imm8 < 256);
   18458    UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   18459    IRTemp and_vec = newTemp(Ity_V128);
   18460    IRTemp sum_vec = newTemp(Ity_V128);
   18461    IRTemp rm      = newTemp(Ity_I32);
   18462    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   18463    assign( and_vec, binop( Iop_AndV128,
   18464                            triop( Iop_Mul64Fx2,
   18465                                   mkexpr(rm),
   18466                                   mkexpr(dst_vec), mkexpr(src_vec) ),
   18467                            mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
   18468 
   18469    assign( sum_vec, binop( Iop_Add64F0x2,
   18470                            binop( Iop_InterleaveHI64x2,
   18471                                   mkexpr(and_vec), mkexpr(and_vec) ),
   18472                            binop( Iop_InterleaveLO64x2,
   18473                                   mkexpr(and_vec), mkexpr(and_vec) ) ) );
   18474    IRTemp res = newTemp(Ity_V128);
   18475    assign(res, binop( Iop_AndV128,
   18476                       binop( Iop_InterleaveLO64x2,
   18477                              mkexpr(sum_vec), mkexpr(sum_vec) ),
   18478                       mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   18479    return res;
   18480 }
   18481 
   18482 
static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   /* DPPS: multiply the four F32 lanes pairwise, keep only the
      products selected by imm8[7:4], sum the four (possibly zeroed)
      products, and place the sum in the result lanes selected by
      imm8[3:0].  Returns the result as a new temp. */
   vassert(imm8 < 256);
   IRTemp tmp_prod_vec = newTemp(Ity_V128);
   IRTemp prod_vec     = newTemp(Ity_V128);
   IRTemp sum_vec      = newTemp(Ity_V128);
   IRTemp rm           = newTemp(Ity_I32);
   IRTemp v3, v2, v1, v0;
   v3 = v2 = v1 = v0   = IRTemp_INVALID;
   /* Keep-mask per 4-bit lane-select value; one nibble per lane. */
   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
                             0xFFFF };

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   /* Elementwise products, with unselected lanes zeroed out. */
   assign( tmp_prod_vec,
           binop( Iop_AndV128,
                  triop( Iop_Mul32Fx4,
                         mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
   /* Swap the two middle lanes (v3,v1,v2,v0 -- deliberate ordering)
      so that the following interleave/add steps pair up the right
      products for the horizontal sum. */
   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );

   /* First horizontal reduction step: partial pairwise sums. */
   assign( sum_vec, triop( Iop_Add32Fx4,
                           mkexpr(rm),
                           binop( Iop_InterleaveHI32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
                           binop( Iop_InterleaveLO32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

   /* Second reduction step gives the total in every lane; then mask
      down to the output lanes requested by imm8[3:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_AndV128,
                       triop( Iop_Add32Fx4,
                              mkexpr(rm),
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   return res;
}
   18524 
   18525 
   18526 static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
   18527 {
   18528    /* Mask out bits of the operands we don't need.  This isn't
   18529       strictly necessary, but it does ensure Memcheck doesn't
   18530       give us any false uninitialised value errors as a
   18531       result. */
   18532    UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   18533    UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
   18534 
   18535    IRTemp src_maskV = newTemp(Ity_V128);
   18536    IRTemp dst_maskV = newTemp(Ity_V128);
   18537    assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   18538    assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
   18539 
   18540    IRTemp src_masked = newTemp(Ity_V128);
   18541    IRTemp dst_masked = newTemp(Ity_V128);
   18542    assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   18543    assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
   18544 
   18545    /* Generate 4 64 bit values that we can hand to a clean helper */
   18546    IRTemp sHi = newTemp(Ity_I64);
   18547    IRTemp sLo = newTemp(Ity_I64);
   18548    assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   18549    assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );
   18550 
   18551    IRTemp dHi = newTemp(Ity_I64);
   18552    IRTemp dLo = newTemp(Ity_I64);
   18553    assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   18554    assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );
   18555 
   18556    /* Compute halves of the result separately */
   18557    IRTemp resHi = newTemp(Ity_I64);
   18558    IRTemp resLo = newTemp(Ity_I64);
   18559 
   18560    IRExpr** argsHi
   18561       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   18562                        mkU64( 0x80 | (imm8 & 7) ));
   18563    IRExpr** argsLo
   18564       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   18565                        mkU64( 0x00 | (imm8 & 7) ));
   18566 
   18567    assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   18568                                 "amd64g_calc_mpsadbw",
   18569                                 &amd64g_calc_mpsadbw, argsHi ));
   18570    assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   18571                                 "amd64g_calc_mpsadbw",
   18572                                 &amd64g_calc_mpsadbw, argsLo ));
   18573 
   18574    IRTemp res = newTemp(Ity_V128);
   18575    assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   18576    return res;
   18577 }
   18578 
   18579 static Long dis_EXTRACTPS ( const VexAbiInfo* vbi, Prefix pfx,
   18580                             Long delta, Bool isAvx )
   18581 {
   18582    IRTemp addr       = IRTemp_INVALID;
   18583    Int    alen       = 0;
   18584    HChar  dis_buf[50];
   18585    UChar  modrm      = getUChar(delta);
   18586    Int imm8_10;
   18587    IRTemp xmm_vec    = newTemp(Ity_V128);
   18588    IRTemp src_dword  = newTemp(Ity_I32);
   18589    UInt   rG         = gregOfRexRM(pfx,modrm);
   18590    IRTemp t3, t2, t1, t0;
   18591    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   18592 
   18593    assign( xmm_vec, getXMMReg( rG ) );
   18594    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18595 
   18596    if ( epartIsReg( modrm ) ) {
   18597       imm8_10 = (Int)(getUChar(delta+1) & 3);
   18598    } else {
   18599       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18600       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   18601    }
   18602 
   18603    switch ( imm8_10 ) {
   18604       case 0:  assign( src_dword, mkexpr(t0) ); break;
   18605       case 1:  assign( src_dword, mkexpr(t1) ); break;
   18606       case 2:  assign( src_dword, mkexpr(t2) ); break;
   18607       case 3:  assign( src_dword, mkexpr(t3) ); break;
   18608       default: vassert(0);
   18609    }
   18610 
   18611    if ( epartIsReg( modrm ) ) {
   18612       UInt rE = eregOfRexRM(pfx,modrm);
   18613       putIReg32( rE, mkexpr(src_dword) );
   18614       delta += 1+1;
   18615       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   18616            nameXMMReg( rG ), nameIReg32( rE ) );
   18617    } else {
   18618       storeLE( mkexpr(addr), mkexpr(src_dword) );
   18619       delta += alen+1;
   18620       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   18621            nameXMMReg( rG ), dis_buf );
   18622    }
   18623 
   18624    return delta;
   18625 }
   18626 
   18627 
   18628 static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
   18629 {
   18630    IRTemp t0 = newTemp(Ity_I64);
   18631    IRTemp t1 = newTemp(Ity_I64);
   18632    assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
   18633               mkexpr(dV)));
   18634    assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
   18635               mkexpr(sV)));
   18636 
   18637    IRTemp t2 = newTemp(Ity_I64);
   18638    IRTemp t3 = newTemp(Ity_I64);
   18639 
   18640    IRExpr** args;
   18641 
   18642    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   18643    assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   18644                             &amd64g_calculate_pclmul, args));
   18645    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   18646    assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   18647                             &amd64g_calculate_pclmul, args));
   18648 
   18649    IRTemp res     = newTemp(Ity_V128);
   18650    assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   18651    return res;
   18652 }
   18653 
   18654 
   18655 __attribute__((noinline))
   18656 static
   18657 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
   18658                           const VexAbiInfo* vbi,
   18659                           Prefix pfx, Int sz, Long deltaIN )
   18660 {
   18661    IRTemp addr  = IRTemp_INVALID;
   18662    UChar  modrm = 0;
   18663    Int    alen  = 0;
   18664    HChar  dis_buf[50];
   18665 
   18666    *decode_OK = False;
   18667 
   18668    Long   delta = deltaIN;
   18669    UChar  opc   = getUChar(delta);
   18670    delta++;
   18671    switch (opc) {
   18672 
   18673    case 0x08:
   18674       /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   18675       if (have66noF2noF3(pfx) && sz == 2) {
   18676 
   18677          IRTemp src0 = newTemp(Ity_F32);
   18678          IRTemp src1 = newTemp(Ity_F32);
   18679          IRTemp src2 = newTemp(Ity_F32);
   18680          IRTemp src3 = newTemp(Ity_F32);
   18681          IRTemp res0 = newTemp(Ity_F32);
   18682          IRTemp res1 = newTemp(Ity_F32);
   18683          IRTemp res2 = newTemp(Ity_F32);
   18684          IRTemp res3 = newTemp(Ity_F32);
   18685          IRTemp rm   = newTemp(Ity_I32);
   18686          Int    imm  = 0;
   18687 
   18688          modrm = getUChar(delta);
   18689 
   18690          if (epartIsReg(modrm)) {
   18691             assign( src0,
   18692                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   18693             assign( src1,
   18694                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
   18695             assign( src2,
   18696                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
   18697             assign( src3,
   18698                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
   18699             imm = getUChar(delta+1);
   18700             if (imm & ~15) goto decode_failure;
   18701             delta += 1+1;
   18702             DIP( "roundps $%d,%s,%s\n",
   18703                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18704                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18705          } else {
   18706             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18707             gen_SEGV_if_not_16_aligned(addr);
   18708             assign( src0, loadLE(Ity_F32,
   18709                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   18710             assign( src1, loadLE(Ity_F32,
   18711                                  binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
   18712             assign( src2, loadLE(Ity_F32,
   18713                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   18714             assign( src3, loadLE(Ity_F32,
   18715                                  binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
   18716             imm = getUChar(delta+alen);
   18717             if (imm & ~15) goto decode_failure;
   18718             delta += alen+1;
   18719             DIP( "roundps $%d,%s,%s\n",
   18720                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18721          }
   18722 
   18723          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18724             that encoding is the same as the encoding for IRRoundingMode,
   18725             we can use that value directly in the IR as a rounding
   18726             mode. */
   18727          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   18728 
   18729          assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
   18730          assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
   18731          assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
   18732          assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
   18733 
   18734          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   18735          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   18736          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
   18737          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
   18738 
   18739          goto decode_success;
   18740       }
   18741       break;
   18742 
   18743    case 0x09:
   18744       /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   18745       if (have66noF2noF3(pfx) && sz == 2) {
   18746 
   18747          IRTemp src0 = newTemp(Ity_F64);
   18748          IRTemp src1 = newTemp(Ity_F64);
   18749          IRTemp res0 = newTemp(Ity_F64);
   18750          IRTemp res1 = newTemp(Ity_F64);
   18751          IRTemp rm   = newTemp(Ity_I32);
   18752          Int    imm  = 0;
   18753 
   18754          modrm = getUChar(delta);
   18755 
   18756          if (epartIsReg(modrm)) {
   18757             assign( src0,
   18758                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
   18759             assign( src1,
   18760                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
   18761             imm = getUChar(delta+1);
   18762             if (imm & ~15) goto decode_failure;
   18763             delta += 1+1;
   18764             DIP( "roundpd $%d,%s,%s\n",
   18765                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18766                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18767          } else {
   18768             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18769             gen_SEGV_if_not_16_aligned(addr);
   18770             assign( src0, loadLE(Ity_F64,
   18771                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   18772             assign( src1, loadLE(Ity_F64,
   18773                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   18774             imm = getUChar(delta+alen);
   18775             if (imm & ~15) goto decode_failure;
   18776             delta += alen+1;
   18777             DIP( "roundpd $%d,%s,%s\n",
   18778                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18779          }
   18780 
   18781          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18782             that encoding is the same as the encoding for IRRoundingMode,
   18783             we can use that value directly in the IR as a rounding
   18784             mode. */
   18785          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   18786 
   18787          assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
   18788          assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
   18789 
   18790          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   18791          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   18792 
   18793          goto decode_success;
   18794       }
   18795       break;
   18796 
   18797    case 0x0A:
   18798    case 0x0B:
   18799       /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   18800          66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   18801       */
   18802       if (have66noF2noF3(pfx) && sz == 2) {
   18803 
   18804          Bool   isD = opc == 0x0B;
   18805          IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   18806          IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   18807          Int    imm = 0;
   18808 
   18809          modrm = getUChar(delta);
   18810 
   18811          if (epartIsReg(modrm)) {
   18812             assign( src,
   18813                     isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
   18814                         : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   18815             imm = getUChar(delta+1);
   18816             if (imm & ~15) goto decode_failure;
   18817             delta += 1+1;
   18818             DIP( "rounds%c $%d,%s,%s\n",
   18819                  isD ? 'd' : 's',
   18820                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18821                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18822          } else {
   18823             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18824             assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   18825             imm = getUChar(delta+alen);
   18826             if (imm & ~15) goto decode_failure;
   18827             delta += alen+1;
   18828             DIP( "rounds%c $%d,%s,%s\n",
   18829                  isD ? 'd' : 's',
   18830                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18831          }
   18832 
   18833          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18834             that encoding is the same as the encoding for IRRoundingMode,
   18835             we can use that value directly in the IR as a rounding
   18836             mode. */
   18837          assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   18838                            (imm & 4) ? get_sse_roundingmode()
   18839                                      : mkU32(imm & 3),
   18840                            mkexpr(src)) );
   18841 
   18842          if (isD)
   18843             putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   18844          else
   18845             putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   18846 
   18847          goto decode_success;
   18848       }
   18849       break;
   18850 
   18851    case 0x0C:
   18852       /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   18853          Blend Packed Single Precision Floating-Point Values (XMM) */
   18854       if (have66noF2noF3(pfx) && sz == 2) {
   18855 
   18856          Int imm8;
   18857          IRTemp dst_vec = newTemp(Ity_V128);
   18858          IRTemp src_vec = newTemp(Ity_V128);
   18859 
   18860          modrm = getUChar(delta);
   18861 
   18862          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18863 
   18864          if ( epartIsReg( modrm ) ) {
   18865             imm8 = (Int)getUChar(delta+1);
   18866             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18867             delta += 1+1;
   18868             DIP( "blendps $%d, %s,%s\n", imm8,
   18869                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18870                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18871          } else {
   18872             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18873                              1/* imm8 is 1 byte after the amode */ );
   18874             gen_SEGV_if_not_16_aligned( addr );
   18875             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18876             imm8 = (Int)getUChar(delta+alen);
   18877             delta += alen+1;
   18878             DIP( "blendpd $%d, %s,%s\n",
   18879                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18880          }
   18881 
   18882          putXMMReg( gregOfRexRM(pfx, modrm),
   18883                     mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
   18884          goto decode_success;
   18885       }
   18886       break;
   18887 
   18888    case 0x0D:
   18889       /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
   18890          Blend Packed Double Precision Floating-Point Values (XMM) */
   18891       if (have66noF2noF3(pfx) && sz == 2) {
   18892 
   18893          Int imm8;
   18894          IRTemp dst_vec = newTemp(Ity_V128);
   18895          IRTemp src_vec = newTemp(Ity_V128);
   18896 
   18897          modrm = getUChar(delta);
   18898          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18899 
   18900          if ( epartIsReg( modrm ) ) {
   18901             imm8 = (Int)getUChar(delta+1);
   18902             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18903             delta += 1+1;
   18904             DIP( "blendpd $%d, %s,%s\n", imm8,
   18905                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18906                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18907          } else {
   18908             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18909                              1/* imm8 is 1 byte after the amode */ );
   18910             gen_SEGV_if_not_16_aligned( addr );
   18911             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18912             imm8 = (Int)getUChar(delta+alen);
   18913             delta += alen+1;
   18914             DIP( "blendpd $%d, %s,%s\n",
   18915                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18916          }
   18917 
   18918          putXMMReg( gregOfRexRM(pfx, modrm),
   18919                     mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
   18920          goto decode_success;
   18921       }
   18922       break;
   18923 
   18924    case 0x0E:
   18925       /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   18926          Blend Packed Words (XMM) */
   18927       if (have66noF2noF3(pfx) && sz == 2) {
   18928 
   18929          Int imm8;
   18930          IRTemp dst_vec = newTemp(Ity_V128);
   18931          IRTemp src_vec = newTemp(Ity_V128);
   18932 
   18933          modrm = getUChar(delta);
   18934 
   18935          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18936 
   18937          if ( epartIsReg( modrm ) ) {
   18938             imm8 = (Int)getUChar(delta+1);
   18939             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18940             delta += 1+1;
   18941             DIP( "pblendw $%d, %s,%s\n", imm8,
   18942                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18943                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18944          } else {
   18945             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18946                              1/* imm8 is 1 byte after the amode */ );
   18947             gen_SEGV_if_not_16_aligned( addr );
   18948             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18949             imm8 = (Int)getUChar(delta+alen);
   18950             delta += alen+1;
   18951             DIP( "pblendw $%d, %s,%s\n",
   18952                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18953          }
   18954 
   18955          putXMMReg( gregOfRexRM(pfx, modrm),
   18956                     mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
   18957          goto decode_success;
   18958       }
   18959       break;
   18960 
   18961    case 0x14:
   18962       /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   18963          Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
   18964          (XMM) */
   18965       if (have66noF2noF3(pfx) && sz == 2) {
   18966          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   18967          goto decode_success;
   18968       }
   18969       break;
   18970 
   18971    case 0x15:
   18972       /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
   18973          Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
   18974          (XMM) */
   18975       if (have66noF2noF3(pfx) && sz == 2) {
   18976          delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
   18977          goto decode_success;
   18978       }
   18979       break;
   18980 
   18981    case 0x16:
   18982       /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
   18983          Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
   18984          Note that this insn has the same opcodes as PEXTRQ, but
   18985          here the REX.W bit is _not_ present */
   18986       if (have66noF2noF3(pfx)
   18987           && sz == 2 /* REX.W is _not_ present */) {
   18988          delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
   18989          goto decode_success;
   18990       }
   18991       /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
   18992          Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
   18993          Note that this insn has the same opcodes as PEXTRD, but
   18994          here the REX.W bit is present */
   18995       if (have66noF2noF3(pfx)
   18996           && sz == 8 /* REX.W is present */) {
   18997          delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
   18998          goto decode_success;
   18999       }
   19000       break;
   19001 
   19002    case 0x17:
   19003       /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
   19004          float from xmm reg and store in gen.reg or mem.  This is
   19005          identical to PEXTRD, except that REX.W appears to be ignored.
   19006       */
   19007       if (have66noF2noF3(pfx)
   19008           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   19009          delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
   19010          goto decode_success;
   19011       }
   19012       break;
   19013 
   19014    case 0x20:
   19015       /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
   19016          Extract byte from r32/m8 and insert into xmm1 */
   19017       if (have66noF2noF3(pfx) && sz == 2) {
   19018          Int    imm8;
   19019          IRTemp new8 = newTemp(Ity_I8);
   19020          modrm = getUChar(delta);
   19021          UInt rG = gregOfRexRM(pfx, modrm);
   19022          if ( epartIsReg( modrm ) ) {
   19023             UInt rE = eregOfRexRM(pfx,modrm);
   19024             imm8 = (Int)(getUChar(delta+1) & 0xF);
   19025             assign( new8, unop(Iop_32to8, getIReg32(rE)) );
   19026             delta += 1+1;
   19027             DIP( "pinsrb $%d,%s,%s\n", imm8,
   19028                  nameIReg32(rE), nameXMMReg(rG) );
   19029          } else {
   19030             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19031             imm8 = (Int)(getUChar(delta+alen) & 0xF);
   19032             assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
   19033             delta += alen+1;
   19034             DIP( "pinsrb $%d,%s,%s\n",
   19035                  imm8, dis_buf, nameXMMReg(rG) );
   19036          }
   19037          IRTemp src_vec = newTemp(Ity_V128);
   19038          assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
   19039          IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
   19040          putXMMReg( rG, mkexpr(res) );
   19041          goto decode_success;
   19042       }
   19043       break;
   19044 
   19045    case 0x21:
   19046       /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
   19047          Insert Packed Single Precision Floating-Point Value (XMM) */
   19048       if (have66noF2noF3(pfx) && sz == 2) {
   19049          UInt   imm8;
   19050          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   19051          const IRTemp inval = IRTemp_INVALID;
   19052 
   19053          modrm = getUChar(delta);
   19054          UInt rG = gregOfRexRM(pfx, modrm);
   19055 
   19056          if ( epartIsReg( modrm ) ) {
   19057             UInt   rE = eregOfRexRM(pfx, modrm);
   19058             IRTemp vE = newTemp(Ity_V128);
   19059             assign( vE, getXMMReg(rE) );
   19060             IRTemp dsE[4] = { inval, inval, inval, inval };
   19061             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   19062             imm8 = getUChar(delta+1);
   19063             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   19064             delta += 1+1;
   19065             DIP( "insertps $%u, %s,%s\n",
   19066                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19067          } else {
   19068             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19069             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   19070             imm8 = getUChar(delta+alen);
   19071             delta += alen+1;
   19072             DIP( "insertps $%u, %s,%s\n",
   19073                  imm8, dis_buf, nameXMMReg(rG) );
   19074          }
   19075 
   19076          IRTemp vG = newTemp(Ity_V128);
   19077          assign( vG, getXMMReg(rG) );
   19078 
   19079          putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
   19080          goto decode_success;
   19081       }
   19082       break;
   19083 
   19084    case 0x22:
   19085       /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
   19086          Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
   19087       if (have66noF2noF3(pfx)
   19088           && sz == 2 /* REX.W is NOT present */) {
   19089          Int    imm8_10;
   19090          IRTemp src_u32 = newTemp(Ity_I32);
   19091          modrm = getUChar(delta);
   19092          UInt rG = gregOfRexRM(pfx, modrm);
   19093 
   19094          if ( epartIsReg( modrm ) ) {
   19095             UInt rE = eregOfRexRM(pfx,modrm);
   19096             imm8_10 = (Int)(getUChar(delta+1) & 3);
   19097             assign( src_u32, getIReg32( rE ) );
   19098             delta += 1+1;
   19099             DIP( "pinsrd $%d, %s,%s\n",
   19100                  imm8_10, nameIReg32(rE), nameXMMReg(rG) );
   19101          } else {
   19102             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19103             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   19104             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   19105             delta += alen+1;
   19106             DIP( "pinsrd $%d, %s,%s\n",
   19107                  imm8_10, dis_buf, nameXMMReg(rG) );
   19108          }
   19109 
   19110          IRTemp src_vec = newTemp(Ity_V128);
   19111          assign(src_vec, getXMMReg( rG ));
   19112          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   19113          putXMMReg( rG, mkexpr(res_vec) );
   19114          goto decode_success;
   19115       }
   19116       /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
   19117          Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
   19118       if (have66noF2noF3(pfx)
   19119           && sz == 8 /* REX.W is present */) {
   19120          Int imm8_0;
   19121          IRTemp src_u64 = newTemp(Ity_I64);
   19122          modrm = getUChar(delta);
   19123          UInt rG = gregOfRexRM(pfx, modrm);
   19124 
   19125          if ( epartIsReg( modrm ) ) {
   19126             UInt rE = eregOfRexRM(pfx,modrm);
   19127             imm8_0 = (Int)(getUChar(delta+1) & 1);
   19128             assign( src_u64, getIReg64( rE ) );
   19129             delta += 1+1;
   19130             DIP( "pinsrq $%d, %s,%s\n",
   19131                  imm8_0, nameIReg64(rE), nameXMMReg(rG) );
   19132          } else {
   19133             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19134             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   19135             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   19136             delta += alen+1;
   19137             DIP( "pinsrq $%d, %s,%s\n",
   19138                  imm8_0, dis_buf, nameXMMReg(rG) );
   19139          }
   19140 
   19141          IRTemp src_vec = newTemp(Ity_V128);
   19142          assign(src_vec, getXMMReg( rG ));
   19143          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   19144          putXMMReg( rG, mkexpr(res_vec) );
   19145          goto decode_success;
   19146       }
   19147       break;
   19148 
   19149    case 0x40:
   19150       /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
   19151          Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   19152       if (have66noF2noF3(pfx) && sz == 2) {
   19153          modrm = getUChar(delta);
   19154          Int    imm8;
   19155          IRTemp src_vec = newTemp(Ity_V128);
   19156          IRTemp dst_vec = newTemp(Ity_V128);
   19157          UInt   rG      = gregOfRexRM(pfx, modrm);
   19158          assign( dst_vec, getXMMReg( rG ) );
   19159          if ( epartIsReg( modrm ) ) {
   19160             UInt rE = eregOfRexRM(pfx, modrm);
   19161             imm8 = (Int)getUChar(delta+1);
   19162             assign( src_vec, getXMMReg(rE) );
   19163             delta += 1+1;
   19164             DIP( "dpps $%d, %s,%s\n",
   19165                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19166          } else {
   19167             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19168                              1/* imm8 is 1 byte after the amode */ );
   19169             gen_SEGV_if_not_16_aligned( addr );
   19170             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19171             imm8 = (Int)getUChar(delta+alen);
   19172             delta += alen+1;
   19173             DIP( "dpps $%d, %s,%s\n",
   19174                  imm8, dis_buf, nameXMMReg(rG) );
   19175          }
   19176          IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
   19177          putXMMReg( rG, mkexpr(res) );
   19178          goto decode_success;
   19179       }
   19180       break;
   19181 
   19182    case 0x41:
   19183       /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
   19184          Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   19185       if (have66noF2noF3(pfx) && sz == 2) {
   19186          modrm = getUChar(delta);
   19187          Int    imm8;
   19188          IRTemp src_vec = newTemp(Ity_V128);
   19189          IRTemp dst_vec = newTemp(Ity_V128);
   19190          UInt   rG      = gregOfRexRM(pfx, modrm);
   19191          assign( dst_vec, getXMMReg( rG ) );
   19192          if ( epartIsReg( modrm ) ) {
   19193             UInt rE = eregOfRexRM(pfx, modrm);
   19194             imm8 = (Int)getUChar(delta+1);
   19195             assign( src_vec, getXMMReg(rE) );
   19196             delta += 1+1;
   19197             DIP( "dppd $%d, %s,%s\n",
   19198                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19199          } else {
   19200             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19201                              1/* imm8 is 1 byte after the amode */ );
   19202             gen_SEGV_if_not_16_aligned( addr );
   19203             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19204             imm8 = (Int)getUChar(delta+alen);
   19205             delta += alen+1;
   19206             DIP( "dppd $%d, %s,%s\n",
   19207                  imm8, dis_buf, nameXMMReg(rG) );
   19208          }
   19209          IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
   19210          putXMMReg( rG, mkexpr(res) );
   19211          goto decode_success;
   19212       }
   19213       break;
   19214 
   19215    case 0x42:
   19216       /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
   19217          Multiple Packed Sums of Absolule Difference (XMM) */
   19218       if (have66noF2noF3(pfx) && sz == 2) {
   19219          Int    imm8;
   19220          IRTemp src_vec = newTemp(Ity_V128);
   19221          IRTemp dst_vec = newTemp(Ity_V128);
   19222          modrm          = getUChar(delta);
   19223          UInt   rG      = gregOfRexRM(pfx, modrm);
   19224 
   19225          assign( dst_vec, getXMMReg(rG) );
   19226 
   19227          if ( epartIsReg( modrm ) ) {
   19228             UInt rE = eregOfRexRM(pfx, modrm);
   19229 
   19230             imm8 = (Int)getUChar(delta+1);
   19231             assign( src_vec, getXMMReg(rE) );
   19232             delta += 1+1;
   19233             DIP( "mpsadbw $%d, %s,%s\n", imm8,
   19234                  nameXMMReg(rE), nameXMMReg(rG) );
   19235          } else {
   19236             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19237                              1/* imm8 is 1 byte after the amode */ );
   19238             gen_SEGV_if_not_16_aligned( addr );
   19239             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19240             imm8 = (Int)getUChar(delta+alen);
   19241             delta += alen+1;
   19242             DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
   19243          }
   19244 
   19245          putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
   19246          goto decode_success;
   19247       }
   19248       break;
   19249 
   19250    case 0x44:
   19251       /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   19252        * Carry-less multiplication of selected XMM quadwords into XMM
   19253        * registers (a.k.a multiplication of polynomials over GF(2))
   19254        */
   19255       if (have66noF2noF3(pfx) && sz == 2) {
   19256 
   19257          Int imm8;
   19258          IRTemp svec = newTemp(Ity_V128);
   19259          IRTemp dvec = newTemp(Ity_V128);
   19260          modrm       = getUChar(delta);
   19261          UInt   rG   = gregOfRexRM(pfx, modrm);
   19262 
   19263          assign( dvec, getXMMReg(rG) );
   19264 
   19265          if ( epartIsReg( modrm ) ) {
   19266             UInt rE = eregOfRexRM(pfx, modrm);
   19267             imm8 = (Int)getUChar(delta+1);
   19268             assign( svec, getXMMReg(rE) );
   19269             delta += 1+1;
   19270             DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   19271                  nameXMMReg(rE), nameXMMReg(rG) );
   19272          } else {
   19273             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19274                              1/* imm8 is 1 byte after the amode */ );
   19275             gen_SEGV_if_not_16_aligned( addr );
   19276             assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   19277             imm8 = (Int)getUChar(delta+alen);
   19278             delta += alen+1;
   19279             DIP( "pclmulqdq $%d, %s,%s\n",
   19280                  imm8, dis_buf, nameXMMReg(rG) );
   19281          }
   19282 
   19283          putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
   19284          goto decode_success;
   19285       }
   19286       break;
   19287 
   19288    case 0x60:
   19289    case 0x61:
   19290    case 0x62:
   19291    case 0x63:
   19292       /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
   19293          66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
   19294          66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
   19295          66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
   19296          (selected special cases that actually occur in glibc,
   19297           not by any means a complete implementation.)
   19298       */
   19299       if (have66noF2noF3(pfx) && sz == 2) {
   19300          Long delta0 = delta;
   19301          delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
   19302          if (delta > delta0) goto decode_success;
    19303          /* else fall through; dis_PCMPxSTRx failed to decode it */
   19304       }
   19305       break;
   19306 
   19307    case 0xDF:
   19308       /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
   19309       if (have66noF2noF3(pfx) && sz == 2) {
   19310          delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
   19311          goto decode_success;
   19312       }
   19313       break;
   19314 
   19315    default:
   19316       break;
   19317 
   19318    }
   19319 
   19320   decode_failure:
   19321    *decode_OK = False;
   19322    return deltaIN;
   19323 
   19324   decode_success:
   19325    *decode_OK = True;
   19326    return delta;
   19327 }
   19328 
   19329 
   19330 /*------------------------------------------------------------*/
   19331 /*---                                                      ---*/
   19332 /*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
   19333 /*---                                                      ---*/
   19334 /*------------------------------------------------------------*/
   19335 
   19336 __attribute__((noinline))
   19337 static
   19338 Long dis_ESC_NONE (
   19339         /*MB_OUT*/DisResult* dres,
   19340         /*MB_OUT*/Bool*      expect_CAS,
   19341         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   19342         Bool         resteerCisOk,
   19343         void*        callback_opaque,
   19344         const VexArchInfo* archinfo,
   19345         const VexAbiInfo*  vbi,
   19346         Prefix pfx, Int sz, Long deltaIN
   19347      )
   19348 {
   19349    Long   d64   = 0;
   19350    UChar  abyte = 0;
   19351    IRTemp addr  = IRTemp_INVALID;
   19352    IRTemp t1    = IRTemp_INVALID;
   19353    IRTemp t2    = IRTemp_INVALID;
   19354    IRTemp t3    = IRTemp_INVALID;
   19355    IRTemp t4    = IRTemp_INVALID;
   19356    IRTemp t5    = IRTemp_INVALID;
   19357    IRType ty    = Ity_INVALID;
   19358    UChar  modrm = 0;
   19359    Int    am_sz = 0;
   19360    Int    d_sz  = 0;
   19361    Int    alen  = 0;
   19362    HChar  dis_buf[50];
   19363 
   19364    Long   delta = deltaIN;
   19365    UChar  opc   = getUChar(delta); delta++;
   19366 
   19367    /* delta now points at the modrm byte.  In most of the cases that
   19368       follow, neither the F2 nor F3 prefixes are allowed.  However,
   19369       for some basic arithmetic operations we have to allow F2/XACQ or
   19370       F3/XREL in the case where the destination is memory and the LOCK
   19371       prefix is also present.  Do this check by looking at the modrm
   19372       byte but not advancing delta over it. */
   19373    /* By default, F2 and F3 are not allowed, so let's start off with
   19374       that setting. */
   19375    Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   19376    { UChar tmp_modrm = getUChar(delta);
   19377      switch (opc) {
   19378         case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
   19379         case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
   19380         case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
   19381         case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
   19382         case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
   19383         case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
   19384         case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
   19385            if (!epartIsReg(tmp_modrm)
   19386                && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   19387               /* dst is mem, and we have F2 or F3 but not both */
   19388               validF2orF3 = True;
   19389            }
   19390            break;
   19391         default:
   19392            break;
   19393      }
   19394    }
   19395 
   19396    /* Now, in the switch below, for the opc values examined by the
   19397       switch above, use validF2orF3 rather than looking at pfx
   19398       directly. */
   19399    switch (opc) {
   19400 
   19401    case 0x00: /* ADD Gb,Eb */
   19402       if (!validF2orF3) goto decode_failure;
   19403       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   19404       return delta;
   19405    case 0x01: /* ADD Gv,Ev */
   19406       if (!validF2orF3) goto decode_failure;
   19407       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   19408       return delta;
   19409 
   19410    case 0x02: /* ADD Eb,Gb */
   19411       if (haveF2orF3(pfx)) goto decode_failure;
   19412       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   19413       return delta;
   19414    case 0x03: /* ADD Ev,Gv */
   19415       if (haveF2orF3(pfx)) goto decode_failure;
   19416       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   19417       return delta;
   19418 
   19419    case 0x04: /* ADD Ib, AL */
   19420       if (haveF2orF3(pfx)) goto decode_failure;
   19421       delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
   19422       return delta;
   19423    case 0x05: /* ADD Iv, eAX */
   19424       if (haveF2orF3(pfx)) goto decode_failure;
   19425       delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
   19426       return delta;
   19427 
   19428    case 0x08: /* OR Gb,Eb */
   19429       if (!validF2orF3) goto decode_failure;
   19430       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   19431       return delta;
   19432    case 0x09: /* OR Gv,Ev */
   19433       if (!validF2orF3) goto decode_failure;
   19434       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   19435       return delta;
   19436 
   19437    case 0x0A: /* OR Eb,Gb */
   19438       if (haveF2orF3(pfx)) goto decode_failure;
   19439       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   19440       return delta;
   19441    case 0x0B: /* OR Ev,Gv */
   19442       if (haveF2orF3(pfx)) goto decode_failure;
   19443       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   19444       return delta;
   19445 
   19446    case 0x0C: /* OR Ib, AL */
   19447       if (haveF2orF3(pfx)) goto decode_failure;
   19448       delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
   19449       return delta;
   19450    case 0x0D: /* OR Iv, eAX */
   19451       if (haveF2orF3(pfx)) goto decode_failure;
   19452       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   19453       return delta;
   19454 
   19455    case 0x10: /* ADC Gb,Eb */
   19456       if (!validF2orF3) goto decode_failure;
   19457       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   19458       return delta;
   19459    case 0x11: /* ADC Gv,Ev */
   19460       if (!validF2orF3) goto decode_failure;
   19461       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   19462       return delta;
   19463 
   19464    case 0x12: /* ADC Eb,Gb */
   19465       if (haveF2orF3(pfx)) goto decode_failure;
   19466       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   19467       return delta;
   19468    case 0x13: /* ADC Ev,Gv */
   19469       if (haveF2orF3(pfx)) goto decode_failure;
   19470       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   19471       return delta;
   19472 
   19473    case 0x14: /* ADC Ib, AL */
   19474       if (haveF2orF3(pfx)) goto decode_failure;
   19475       delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
   19476       return delta;
   19477    case 0x15: /* ADC Iv, eAX */
   19478       if (haveF2orF3(pfx)) goto decode_failure;
   19479       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   19480       return delta;
   19481 
   19482    case 0x18: /* SBB Gb,Eb */
   19483       if (!validF2orF3) goto decode_failure;
   19484       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   19485       return delta;
   19486    case 0x19: /* SBB Gv,Ev */
   19487       if (!validF2orF3) goto decode_failure;
   19488       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   19489       return delta;
   19490 
   19491    case 0x1A: /* SBB Eb,Gb */
   19492       if (haveF2orF3(pfx)) goto decode_failure;
   19493       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   19494       return delta;
   19495    case 0x1B: /* SBB Ev,Gv */
   19496       if (haveF2orF3(pfx)) goto decode_failure;
   19497       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   19498       return delta;
   19499 
   19500    case 0x1C: /* SBB Ib, AL */
   19501       if (haveF2orF3(pfx)) goto decode_failure;
   19502       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   19503       return delta;
   19504    case 0x1D: /* SBB Iv, eAX */
   19505       if (haveF2orF3(pfx)) goto decode_failure;
   19506       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   19507       return delta;
   19508 
   19509    case 0x20: /* AND Gb,Eb */
   19510       if (!validF2orF3) goto decode_failure;
   19511       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   19512       return delta;
   19513    case 0x21: /* AND Gv,Ev */
   19514       if (!validF2orF3) goto decode_failure;
   19515       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   19516       return delta;
   19517 
   19518    case 0x22: /* AND Eb,Gb */
   19519       if (haveF2orF3(pfx)) goto decode_failure;
   19520       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   19521       return delta;
   19522    case 0x23: /* AND Ev,Gv */
   19523       if (haveF2orF3(pfx)) goto decode_failure;
   19524       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   19525       return delta;
   19526 
   19527    case 0x24: /* AND Ib, AL */
   19528       if (haveF2orF3(pfx)) goto decode_failure;
   19529       delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
   19530       return delta;
   19531    case 0x25: /* AND Iv, eAX */
   19532       if (haveF2orF3(pfx)) goto decode_failure;
   19533       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   19534       return delta;
   19535 
   19536    case 0x28: /* SUB Gb,Eb */
   19537       if (!validF2orF3) goto decode_failure;
   19538       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   19539       return delta;
   19540    case 0x29: /* SUB Gv,Ev */
   19541       if (!validF2orF3) goto decode_failure;
   19542       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   19543       return delta;
   19544 
   19545    case 0x2A: /* SUB Eb,Gb */
   19546       if (haveF2orF3(pfx)) goto decode_failure;
   19547       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   19548       return delta;
   19549    case 0x2B: /* SUB Ev,Gv */
   19550       if (haveF2orF3(pfx)) goto decode_failure;
   19551       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   19552       return delta;
   19553 
   19554    case 0x2C: /* SUB Ib, AL */
   19555       if (haveF2orF3(pfx)) goto decode_failure;
   19556       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   19557       return delta;
   19558    case 0x2D: /* SUB Iv, eAX */
   19559       if (haveF2orF3(pfx)) goto decode_failure;
   19560       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   19561       return delta;
   19562 
   19563    case 0x30: /* XOR Gb,Eb */
   19564       if (!validF2orF3) goto decode_failure;
   19565       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   19566       return delta;
   19567    case 0x31: /* XOR Gv,Ev */
   19568       if (!validF2orF3) goto decode_failure;
   19569       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   19570       return delta;
   19571 
   19572    case 0x32: /* XOR Eb,Gb */
   19573       if (haveF2orF3(pfx)) goto decode_failure;
   19574       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   19575       return delta;
   19576    case 0x33: /* XOR Ev,Gv */
   19577       if (haveF2orF3(pfx)) goto decode_failure;
   19578       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   19579       return delta;
   19580 
   19581    case 0x34: /* XOR Ib, AL */
   19582       if (haveF2orF3(pfx)) goto decode_failure;
   19583       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   19584       return delta;
   19585    case 0x35: /* XOR Iv, eAX */
   19586       if (haveF2orF3(pfx)) goto decode_failure;
   19587       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   19588       return delta;
   19589 
   19590    case 0x38: /* CMP Gb,Eb */
   19591       if (haveF2orF3(pfx)) goto decode_failure;
   19592       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   19593       return delta;
   19594    case 0x39: /* CMP Gv,Ev */
   19595       if (haveF2orF3(pfx)) goto decode_failure;
   19596       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   19597       return delta;
   19598 
   19599    case 0x3A: /* CMP Eb,Gb */
   19600       if (haveF2orF3(pfx)) goto decode_failure;
   19601       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   19602       return delta;
   19603    case 0x3B: /* CMP Ev,Gv */
   19604       if (haveF2orF3(pfx)) goto decode_failure;
   19605       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   19606       return delta;
   19607 
   19608    case 0x3C: /* CMP Ib, AL */
   19609       if (haveF2orF3(pfx)) goto decode_failure;
   19610       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   19611       return delta;
   19612    case 0x3D: /* CMP Iv, eAX */
   19613       if (haveF2orF3(pfx)) goto decode_failure;
   19614       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   19615       return delta;
   19616 
   19617    case 0x50: /* PUSH eAX */
   19618    case 0x51: /* PUSH eCX */
   19619    case 0x52: /* PUSH eDX */
   19620    case 0x53: /* PUSH eBX */
   19621    case 0x55: /* PUSH eBP */
   19622    case 0x56: /* PUSH eSI */
   19623    case 0x57: /* PUSH eDI */
   19624    case 0x54: /* PUSH eSP */
   19625       /* This is the Right Way, in that the value to be pushed is
   19626          established before %rsp is changed, so that pushq %rsp
   19627          correctly pushes the old value. */
   19628       if (haveF2orF3(pfx)) goto decode_failure;
   19629       vassert(sz == 2 || sz == 4 || sz == 8);
   19630       if (sz == 4)
   19631          sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
   19632       ty = sz==2 ? Ity_I16 : Ity_I64;
   19633       t1 = newTemp(ty);
   19634       t2 = newTemp(Ity_I64);
   19635       assign(t1, getIRegRexB(sz, pfx, opc-0x50));
   19636       assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
   19637       putIReg64(R_RSP, mkexpr(t2) );
   19638       storeLE(mkexpr(t2),mkexpr(t1));
   19639       DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
   19640       return delta;
   19641 
   19642    case 0x58: /* POP eAX */
   19643    case 0x59: /* POP eCX */
   19644    case 0x5A: /* POP eDX */
   19645    case 0x5B: /* POP eBX */
   19646    case 0x5D: /* POP eBP */
   19647    case 0x5E: /* POP eSI */
   19648    case 0x5F: /* POP eDI */
   19649    case 0x5C: /* POP eSP */
   19650       if (haveF2orF3(pfx)) goto decode_failure;
   19651       vassert(sz == 2 || sz == 4 || sz == 8);
   19652       if (sz == 4)
   19653          sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
   19654       t1 = newTemp(szToITy(sz));
   19655       t2 = newTemp(Ity_I64);
   19656       assign(t2, getIReg64(R_RSP));
   19657       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   19658       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
   19659       putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
   19660       DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
   19661       return delta;
   19662 
   19663    case 0x63: /* MOVSX */
   19664       if (haveF2orF3(pfx)) goto decode_failure;
   19665       if (haveREX(pfx) && 1==getRexW(pfx)) {
   19666          vassert(sz == 8);
   19667          /* movsx r/m32 to r64 */
   19668          modrm = getUChar(delta);
   19669          if (epartIsReg(modrm)) {
   19670             delta++;
   19671             putIRegG(8, pfx, modrm,
   19672                              unop(Iop_32Sto64,
   19673                                   getIRegE(4, pfx, modrm)));
   19674             DIP("movslq %s,%s\n",
   19675                 nameIRegE(4, pfx, modrm),
   19676                 nameIRegG(8, pfx, modrm));
   19677             return delta;
   19678          } else {
   19679             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   19680             delta += alen;
   19681             putIRegG(8, pfx, modrm,
   19682                              unop(Iop_32Sto64,
   19683                                   loadLE(Ity_I32, mkexpr(addr))));
   19684             DIP("movslq %s,%s\n", dis_buf,
   19685                 nameIRegG(8, pfx, modrm));
   19686             return delta;
   19687          }
   19688       } else {
   19689          goto decode_failure;
   19690       }
   19691 
   19692    case 0x68: /* PUSH Iv */
   19693       if (haveF2orF3(pfx)) goto decode_failure;
   19694       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   19695       if (sz == 4) sz = 8;
   19696       d64 = getSDisp(imin(4,sz),delta);
   19697       delta += imin(4,sz);
   19698       goto do_push_I;
   19699 
   19700    case 0x69: /* IMUL Iv, Ev, Gv */
   19701       if (haveF2orF3(pfx)) goto decode_failure;
   19702       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
   19703       return delta;
   19704 
   19705    case 0x6A: /* PUSH Ib, sign-extended to sz */
   19706       if (haveF2orF3(pfx)) goto decode_failure;
   19707       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   19708       if (sz == 4) sz = 8;
   19709       d64 = getSDisp8(delta); delta += 1;
   19710       goto do_push_I;
   19711    do_push_I:
   19712       ty = szToITy(sz);
   19713       t1 = newTemp(Ity_I64);
   19714       t2 = newTemp(ty);
   19715       assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   19716       putIReg64(R_RSP, mkexpr(t1) );
    19717       /* stop mkU16 asserting if d64 is a negative 16-bit number
   19718          (bug #132813) */
   19719       if (ty == Ity_I16)
   19720          d64 &= 0xFFFF;
   19721       storeLE( mkexpr(t1), mkU(ty,d64) );
   19722       DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
   19723       return delta;
   19724 
   19725    case 0x6B: /* IMUL Ib, Ev, Gv */
   19726       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
   19727       return delta;
   19728 
   19729    case 0x70:
   19730    case 0x71:
   19731    case 0x72:   /* JBb/JNAEb (jump below) */
   19732    case 0x73:   /* JNBb/JAEb (jump not below) */
   19733    case 0x74:   /* JZb/JEb (jump zero) */
   19734    case 0x75:   /* JNZb/JNEb (jump not zero) */
   19735    case 0x76:   /* JBEb/JNAb (jump below or equal) */
   19736    case 0x77:   /* JNBEb/JAb (jump not below or equal) */
   19737    case 0x78:   /* JSb (jump negative) */
    19738    case 0x79:   /* JNSb (jump not negative) */
   19739    case 0x7A:   /* JP (jump parity even) */
   19740    case 0x7B:   /* JNP/JPO (jump parity odd) */
   19741    case 0x7C:   /* JLb/JNGEb (jump less) */
   19742    case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
   19743    case 0x7E:   /* JLEb/JNGb (jump less or equal) */
   19744    case 0x7F: { /* JGb/JNLEb (jump greater) */
   19745       Long   jmpDelta;
   19746       const HChar* comment  = "";
   19747       if (haveF3(pfx)) goto decode_failure;
   19748       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   19749       jmpDelta = getSDisp8(delta);
   19750       vassert(-128 <= jmpDelta && jmpDelta < 128);
   19751       d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
   19752       delta++;
   19753       if (resteerCisOk
   19754           && vex_control.guest_chase_cond
   19755           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   19756           && jmpDelta < 0
   19757           && resteerOkFn( callback_opaque, (Addr64)d64) ) {
   19758          /* Speculation: assume this backward branch is taken.  So we
   19759             need to emit a side-exit to the insn following this one,
   19760             on the negation of the condition, and continue at the
   19761             branch target address (d64).  If we wind up back at the
   19762             first instruction of the trace, just stop; it's better to
   19763             let the IR loop unroller handle that case. */
   19764          stmt( IRStmt_Exit(
   19765                   mk_amd64g_calculate_condition(
   19766                      (AMD64Condcode)(1 ^ (opc - 0x70))),
   19767                   Ijk_Boring,
   19768                   IRConst_U64(guest_RIP_bbstart+delta),
   19769                   OFFB_RIP ) );
   19770          dres->whatNext   = Dis_ResteerC;
   19771          dres->continueAt = d64;
   19772          comment = "(assumed taken)";
   19773       }
   19774       else
   19775       if (resteerCisOk
   19776           && vex_control.guest_chase_cond
   19777           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   19778           && jmpDelta >= 0
   19779           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   19780          /* Speculation: assume this forward branch is not taken.  So
   19781             we need to emit a side-exit to d64 (the dest) and continue
   19782             disassembling at the insn immediately following this
   19783             one. */
   19784          stmt( IRStmt_Exit(
   19785                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
   19786                   Ijk_Boring,
   19787                   IRConst_U64(d64),
   19788                   OFFB_RIP ) );
   19789          dres->whatNext   = Dis_ResteerC;
   19790          dres->continueAt = guest_RIP_bbstart+delta;
   19791          comment = "(assumed not taken)";
   19792       }
   19793       else {
   19794          /* Conservative default translation - end the block at this
   19795             point. */
   19796          jcc_01( dres, (AMD64Condcode)(opc - 0x70),
   19797                  guest_RIP_bbstart+delta, d64 );
   19798          vassert(dres->whatNext == Dis_StopHere);
   19799       }
   19800       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
   19801       return delta;
   19802    }
   19803 
   19804    case 0x80: /* Grp1 Ib,Eb */
   19805       modrm = getUChar(delta);
   19806       /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
   19807          just one for the mem case and also require LOCK in this case.
   19808          Note that this erroneously allows XACQ/XREL on CMP since we
   19809          don't check the subopcode here.  No big deal. */
   19810       if (epartIsReg(modrm) && haveF2orF3(pfx))
   19811          goto decode_failure;
   19812       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   19813          goto decode_failure;
   19814       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   19815          goto decode_failure;
   19816       am_sz = lengthAMode(pfx,delta);
   19817       sz    = 1;
   19818       d_sz  = 1;
   19819       d64   = getSDisp8(delta + am_sz);
   19820       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19821       return delta;
   19822 
   19823    case 0x81: /* Grp1 Iv,Ev */
   19824       modrm = getUChar(delta);
   19825       /* Same comment as for case 0x80 just above. */
   19826       if (epartIsReg(modrm) && haveF2orF3(pfx))
   19827          goto decode_failure;
   19828       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   19829          goto decode_failure;
   19830       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   19831          goto decode_failure;
   19832       am_sz = lengthAMode(pfx,delta);
   19833       d_sz  = imin(sz,4);
   19834       d64   = getSDisp(d_sz, delta + am_sz);
   19835       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19836       return delta;
   19837 
   19838    case 0x83: /* Grp1 Ib,Ev */
   19839       if (haveF2orF3(pfx)) goto decode_failure;
   19840       modrm = getUChar(delta);
   19841       am_sz = lengthAMode(pfx,delta);
   19842       d_sz  = 1;
   19843       d64   = getSDisp8(delta + am_sz);
   19844       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19845       return delta;
   19846 
   19847    case 0x84: /* TEST Eb,Gb */
   19848       if (haveF2orF3(pfx)) goto decode_failure;
   19849       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
   19850       return delta;
   19851 
   19852    case 0x85: /* TEST Ev,Gv */
   19853       if (haveF2orF3(pfx)) goto decode_failure;
   19854       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
   19855       return delta;
   19856 
   19857    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   19858       prefix.  Therefore, generate CAS regardless of the presence or
   19859       otherwise of a LOCK prefix. */
   19860    case 0x86: /* XCHG Gb,Eb */
   19861       sz = 1;
   19862       /* Fall through ... */
   19863    case 0x87: /* XCHG Gv,Ev */
   19864       modrm = getUChar(delta);
   19865       /* Check whether F2 or F3 are allowable.  For the mem case, one
    19866          or the other but not both are.  We don't care about the
   19867          presence of LOCK in this case -- XCHG is unusual in this
   19868          respect. */
   19869       if (haveF2orF3(pfx)) {
   19870          if (epartIsReg(modrm)) {
   19871             goto decode_failure;
   19872          } else {
   19873             if (haveF2andF3(pfx))
   19874                goto decode_failure;
   19875          }
   19876       }
   19877       ty = szToITy(sz);
   19878       t1 = newTemp(ty); t2 = newTemp(ty);
   19879       if (epartIsReg(modrm)) {
   19880          assign(t1, getIRegE(sz, pfx, modrm));
   19881          assign(t2, getIRegG(sz, pfx, modrm));
   19882          putIRegG(sz, pfx, modrm, mkexpr(t1));
   19883          putIRegE(sz, pfx, modrm, mkexpr(t2));
   19884          delta++;
   19885          DIP("xchg%c %s, %s\n",
   19886              nameISize(sz), nameIRegG(sz, pfx, modrm),
   19887                             nameIRegE(sz, pfx, modrm));
   19888       } else {
   19889          *expect_CAS = True;
   19890          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   19891          assign( t1, loadLE(ty, mkexpr(addr)) );
   19892          assign( t2, getIRegG(sz, pfx, modrm) );
   19893          casLE( mkexpr(addr),
   19894                 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   19895          putIRegG( sz, pfx, modrm, mkexpr(t1) );
   19896          delta += alen;
   19897          DIP("xchg%c %s, %s\n", nameISize(sz),
   19898                                 nameIRegG(sz, pfx, modrm), dis_buf);
   19899       }
   19900       return delta;
   19901 
   19902    case 0x88: { /* MOV Gb,Eb */
   19903       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   19904       Bool ok = True;
   19905       delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
   19906       if (!ok) goto decode_failure;
   19907       return delta;
   19908    }
   19909 
   19910    case 0x89: { /* MOV Gv,Ev */
   19911       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   19912       Bool ok = True;
   19913       delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
   19914       if (!ok) goto decode_failure;
   19915       return delta;
   19916    }
   19917 
   19918    case 0x8A: /* MOV Eb,Gb */
   19919       if (haveF2orF3(pfx)) goto decode_failure;
   19920       delta = dis_mov_E_G(vbi, pfx, 1, delta);
   19921       return delta;
   19922 
   19923    case 0x8B: /* MOV Ev,Gv */
   19924       if (haveF2orF3(pfx)) goto decode_failure;
   19925       delta = dis_mov_E_G(vbi, pfx, sz, delta);
   19926       return delta;
   19927 
   case 0x8D: /* LEA M,Gv */
      /* Load Effective Address: compute the address of the memory
         operand and put it in the G register, without any memory
         access.  Only 32- and 64-bit operand sizes are accepted. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4 && sz != 8)
         goto decode_failure;
      modrm = getUChar(delta);
      /* A register E-operand has no address, so LEA with mod=11 is
         invalid. */
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we clear
         any segment override bits in pfx. */
      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
      delta += alen;
      /* This is a hack.  But it isn't clear that really doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
      putIRegG( sz, pfx, modrm,
                         sz == 4
                            ? unop(Iop_64to32, mkexpr(addr))
                            : mkexpr(addr)
              );
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIRegG(sz,pfx,modrm));
      return delta;
   19951 
   case 0x8F: { /* POPQ m64 / POPW m16 */
      /* Pop the top of stack into a memory operand (Grp1A, /0 only). */
      Int   len;
      UChar rm;
      /* There is no encoding for 32-bit pop in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4
              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      rm = getUChar(delta);

      /* make sure this instruction is correct POP: a memory E-operand
         with the reg field (sub-opcode) equal to 0 */
      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
         goto decode_failure;
      /* and has correct size */
      vassert(sz == 8);

      t1 = newTemp(Ity_I64);
      t3 = newTemp(Ity_I64);
      assign( t1, getIReg64(R_RSP) );             /* old RSP */
      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );  /* popped value */

      /* Increase RSP; must be done before the STORE.  Intel manual
         says: If the RSP register is used as a base register for
         addressing a destination operand in memory, the POP
         instruction computes the effective address of the operand
         after it increments the RSP register.  */
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );

      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      storeLE( mkexpr(addr), mkexpr(t3) );

      /* NOTE(review): prints "popl" even though this is the 64-bit
         form; cosmetic only, affects just the debug disassembly. */
      DIP("popl %s\n", dis_buf);

      delta += len;
      return delta;
   }
   19991 
   case 0x90: /* XCHG eAX,eAX */
      /* detect and handle F3 90 (rep nop) specially */
      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* detect and handle NOPs specially */
      if (/* F2/F3 probably change meaning completely */
          !haveF2orF3(pfx)
          /* If REX.B is 1, we're not exchanging rAX with itself */
          && getRexB(pfx)==0 ) {
         DIP("nop\n");
         return delta;
      }
      /* else fall through to normal case. */
      /* With REX.B set, opcode 0x90 means XCHG rAX,r8, which is a
         real exchange and is handled by the shared code below. */
   case 0x91: /* XCHG rAX,rCX */
   case 0x92: /* XCHG rAX,rDX */
   case 0x93: /* XCHG rAX,rBX */
   case 0x94: /* XCHG rAX,rSP */
   case 0x95: /* XCHG rAX,rBP */
   case 0x96: /* XCHG rAX,rSI */
   case 0x97: /* XCHG rAX,rDI */
      /* guard against mutancy */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* The low 3 opcode bits select the register to swap with rAX. */
      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
      return delta;
   20022 
   20023    case 0x98: /* CBW */
   20024       if (haveF2orF3(pfx)) goto decode_failure;
   20025       if (sz == 8) {
   20026          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   20027          DIP(/*"cdqe\n"*/"cltq");
   20028          return delta;
   20029       }
   20030       if (sz == 4) {
   20031          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   20032          DIP("cwtl\n");
   20033          return delta;
   20034       }
   20035       if (sz == 2) {
   20036          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   20037          DIP("cbw\n");
   20038          return delta;
   20039       }
   20040       goto decode_failure;
   20041 
   case 0x99: /* CWD/CDQ/CQO */
      /* Sign-extend rAX into rDX:rAX by filling rDX with copies of
         rAX's sign bit, obtained via an arithmetic right shift by
         (width-1). */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      return delta;

   case 0x9B: /* FWAIT (X87 insn) */
      /* ignore?  No IR is generated; x87 exception checking is not
         modelled, so the wait is a no-op here. */
      DIP("fwait\n");
      return delta;
   20059 
   case 0x9C: /* PUSHF */ {
      /* Push RFLAGS.  The OSZACP bits come from the flags thunk; the
         D, ID and AC bits live out-of-band in separate guest-state
         slots and are patched in below. */
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      /* Decrement RSP first; t1 is the address of the new slot. */
      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* OSZACP from the condition-code thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag (guest slot holds 0 or 1; shift it
         up to bit position 21). */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too (0/1 slot, bit position 18). */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      return delta;
   }
   20117 
   case 0x9D: /* POPF */
      /* Pop RFLAGS: reload OSZACP into the flags thunk (as a COPY
         thunk) and unpack the D, ID and AC bits into their dedicated
         guest-state slots. */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* NOTE(review): unlike PUSHF/POP-m above, a redundant REX.W
         (sz==8) is not tolerated here — confirm that is intentional. */
      vassert(sz == 2 || sz == 4);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1))),
                  mkU64(0xFFFFFFFFFFFFFFFFULL),
                  mkU64(1)))
          );

      /* And set the ID flag (bit 21 of t1, stored as 0/1). */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      /* And set the AC flag too (bit 18 of t1, stored as 0/1). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      DIP("popf%c\n", nameISize(sz));
      return delta;
   20182 
   case 0x9E: /* SAHF */
      /* Store AH into the low byte of flags; IR generation is
         delegated to codegen_SAHF.  NOTE(review): no F2/F3 rejection
         here, unlike neighbouring cases — confirm intentional. */
      codegen_SAHF();
      DIP("sahf\n");
      return delta;

   case 0x9F: /* LAHF */
      /* Load the low byte of flags into AH; delegated to
         codegen_LAHF. */
      codegen_LAHF();
      DIP("lahf\n");
      return delta;
   20192 
   case 0xA0: /* MOV Ob,AL */
      /* Moffs load, byte form.  Forces sz=1 and reuses the 0xA1
         code below. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      /* Load rAX from an absolute 64-bit address embedded in the
         instruction stream. */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;   /* the moffs immediate is always 8 bytes */
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      /* Segment/address-size overrides do apply to moffs forms. */
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), d64,
                                  nameIRegRAX(sz));
      return delta;

   case 0xA2: /* MOV AL,Ob */
      /* Moffs store, byte form.  Forces sz=1 and reuses the 0xA3
         code below. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      /* Store rAX to an absolute 64-bit address embedded in the
         instruction stream. */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;   /* the moffs immediate is always 8 bytes */
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), d64);
      return delta;
   20227 
   20228    case 0xA4:
   20229    case 0xA5:
   20230       /* F3 A4: rep movsb */
   20231       if (haveF3(pfx) && !haveF2(pfx)) {
   20232          if (opc == 0xA4)
   20233             sz = 1;
   20234          dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
   20235                       guest_RIP_curr_instr,
   20236                       guest_RIP_bbstart+delta, "rep movs", pfx );
   20237         dres->whatNext = Dis_StopHere;
   20238         return delta;
   20239       }
   20240       /* A4: movsb */
   20241       if (!haveF3(pfx) && !haveF2(pfx)) {
   20242          if (opc == 0xA4)
   20243             sz = 1;
   20244          dis_string_op( dis_MOVS, sz, "movs", pfx );
   20245          return delta;
   20246       }
   20247       goto decode_failure;
   20248 
   case 0xA6:
   case 0xA7:
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
      }
      /* NOTE(review): the plain (non-REP) and F2 (REPNE) CMPS forms
         are not handled here and fall through to decode_failure. */
      goto decode_failure;
   20262 
   case 0xAA:
   case 0xAB:
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
         /* dis_REP_op ends the block itself. */
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         return delta;
      }
      /* Any F2 form is undecodable. */
      goto decode_failure;
   20283 
   case 0xA8: /* TEST Ib, AL */
      /* Flags-only AND of an immediate with AL; the 'False' args mean
         the immediate is not inverted and the result is not written
         back. */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      return delta;
   case 0xA9: /* TEST Iv, eAX */
      /* As above but at operand size |sz|; dis_op_imm_A adjusts the
         base 8-bit op to the requested size. */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      return delta;
   20292 
   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      /* NOTE(review): unlike the other string ops above, there is no
         F2/F3 rejection here, so a REP-prefixed LODS would decode as
         a single (un-repeated) LODS — confirm this is intended. */
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
      return delta;
   20297 
   case 0xAE:
   case 0xAF:
      /* SCAS: all three prefix forms (REPNE, REPE, none) are handled;
         AE is the byte form, AF the w/l/q form. */
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AE/AF: scasb/scas{w,l,q} */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         return delta;
      }
      /* F2 and F3 together: undecodable. */
      goto decode_failure;
   20328 
   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      /* Move an 8-bit immediate into the byte register selected by
         the low 3 opcode bits (extended by REX.B, handled inside
         putIRegRexB). */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      return delta;
   20344 
   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* Move an immediate into the register selected by the low 3
         opcode bits (extended by REX.B). */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         /* movabsq: a full 8-byte immediate follows the opcode. */
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 1/2/4-byte operand sizes take an immediate of at most 4
            bytes, masked down to the operand size. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      return delta;
   20372 
   case 0xC0: { /* Grp2 Ib,Eb */
      /* Shift/rotate group with an 8-bit immediate count, byte-sized
         operand.  The sub-opcode in modrm.reg selects the operation;
         dis_Grp2 does the decode and IR generation. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;   /* one immediate byte follows the amode */
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC1: { /* Grp2 Ib,Ev */
      /* As 0xC0 but at operand size |sz|. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   20399 
   case 0xC2: /* RET imm16 */
      /* Near return, additionally popping imm16 bytes of arguments. */
      if (have66orF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(dres, vbi, d64);
      DIP("ret $%lld\n", d64);
      return delta;

   case 0xC3: /* RET */
      /* Plain near return. */
      if (have66(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD: "rep ret", a branch-predictor
         workaround idiom. */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      dis_ret(dres, vbi, 0);
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      return delta;
   20416 
   20417    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   20418       sz = 1;
   20419       goto maybe_do_Mov_I_E;
   20420    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   20421       goto maybe_do_Mov_I_E;
   20422    maybe_do_Mov_I_E:
   20423       modrm = getUChar(delta);
   20424       if (gregLO3ofRM(modrm) == 0) {
   20425          if (epartIsReg(modrm)) {
   20426             /* Neither F2 nor F3 are allowable. */
   20427             if (haveF2orF3(pfx)) goto decode_failure;
   20428             delta++; /* mod/rm byte */
   20429             d64 = getSDisp(imin(4,sz),delta);
   20430             delta += imin(4,sz);
   20431             putIRegE(sz, pfx, modrm,
   20432                          mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20433             DIP("mov%c $%lld, %s\n", nameISize(sz),
   20434                                      (Long)d64,
   20435                                      nameIRegE(sz,pfx,modrm));
   20436          } else {
   20437             if (haveF2(pfx)) goto decode_failure;
   20438             /* F3(XRELEASE) is allowable here */
   20439             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   20440                               /*xtra*/imin(4,sz) );
   20441             delta += alen;
   20442             d64 = getSDisp(imin(4,sz),delta);
   20443             delta += imin(4,sz);
   20444             storeLE(mkexpr(addr),
   20445                     mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20446             DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
   20447          }
   20448          return delta;
   20449       }
   20450       /* BEGIN HACKY SUPPORT FOR xbegin */
   20451       if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
   20452           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20453          delta++; /* mod/rm byte */
   20454          d64 = getSDisp(4,delta);
   20455          delta += 4;
   20456          guest_RIP_next_mustcheck = True;
   20457          guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
   20458          Addr64 failAddr = guest_RIP_bbstart + delta + d64;
   20459          /* EAX contains the failure status code.  Bit 3 is "Set if an
   20460             internal buffer overflowed", which seems like the
   20461             least-bogus choice we can make here. */
   20462          putIRegRAX(4, mkU32(1<<3));
   20463          /* And jump to the fail address. */
   20464          jmp_lit(dres, Ijk_Boring, failAddr);
   20465          vassert(dres->whatNext == Dis_StopHere);
   20466          DIP("xbeginq 0x%llx\n", failAddr);
   20467          return delta;
   20468       }
   20469       /* END HACKY SUPPORT FOR xbegin */
   20470       /* BEGIN HACKY SUPPORT FOR xabort */
   20471       if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
   20472           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20473          delta++; /* mod/rm byte */
   20474          abyte = getUChar(delta); delta++;
   20475          /* There is never a real transaction in progress, so do nothing. */
   20476          DIP("xabort $%d", (Int)abyte);
   20477          return delta;
   20478       }
   20479       /* END HACKY SUPPORT FOR xabort */
   20480       goto decode_failure;
   20481 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* Reject any non-zero nesting depth. */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));                /* old RBP */
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));             /* push old RBP */
      putIReg64(R_RBP, mkexpr(t2));                /* new frame ptr */
      if (d64 > 0) {
         /* Reserve imm16 bytes of locals. */
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      return delta;
   20512 
   case 0xC9: /* LEAVE */
      /* Tear down a stack frame: rsp := rbp; rbp := pop(). */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));      /* saved RBP */
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      return delta;
   20530 
   case 0xCC: /* INT 3 */
      /* Breakpoint: end the block and deliver SIGTRAP, with the
         resume point set to the insn following the int3. */
      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      return delta;
   20536 
   case 0xD0: { /* Grp2 1,Eb */
      /* Shift/rotate group with an implicit count of 1, byte-sized
         operand.  dis_Grp2 decodes the sub-opcode and generates IR. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;   /* no immediate byte */
      d64   = 1;   /* the implicit shift count */
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD1: { /* Grp2 1,Ev */
      /* As 0xD0 but at operand size |sz|. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD2: { /* Grp2 CL,Eb */
      /* Shift/rotate group with the count taken from CL, byte-sized
         operand. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD3: { /* Grp2 CL,Ev */
      /* As 0xD2 but at operand size |sz|. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   20588 
   case 0xD8: /* X87 instructions */
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      /* Gateway to the x87 FPU decoder (dis_FPU).  Before delegating,
         vet the prefixes and operand size: only a few size overrides
         are tolerated, as special-cased below. */
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      Bool size_OK = False;
      if ( sz == 4 )
         size_OK = True;
      else if ( sz == 8 )
         size_OK = redundantREXWok;
      else if ( sz == 2 ) {
         int mod_rm = getUChar(delta+0);
         int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FNSAVE */ ||
                                reg == 6 /* FRSTOR */ ) )
            size_OK = True;
      }
      /* AMD manual says 0x66 size override is ignored, except where
         it is meaningful */
      if (!size_OK)
         goto decode_failure;

      Bool decode_OK = False;
      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
      if (!decode_OK)
         goto decode_failure;

      return delta;
   }
   20634 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      const HChar* xtra = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);   /* branch target */
      delta++;
      if (haveASO(pfx)) {
         /* Decrement ECX (32-bit counter). */
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         /* Decrement RCX (64-bit counter). */
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Conditional side-exit to the target; fall through otherwise. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
      return delta;
    }
   20690 
   20691    case 0xE3:
      /* JRCXZ or JECXZ, depending on the address size override. */
   20693       if (have66orF2orF3(pfx)) goto decode_failure;
   20694       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   20695       delta++;
   20696       if (haveASO(pfx)) {
   20697          /* 32-bit */
   20698          stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
   20699                                   unop(Iop_32Uto64, getIReg32(R_RCX)),
   20700                                   mkU64(0)),
   20701                             Ijk_Boring,
   20702                             IRConst_U64(d64),
   20703                             OFFB_RIP
   20704              ));
   20705          DIP("jecxz 0x%llx\n", d64);
   20706       } else {
   20707          /* 64-bit */
   20708          stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
   20709                                   getIReg64(R_RCX),
   20710                                   mkU64(0)),
   20711                             Ijk_Boring,
   20712                             IRConst_U64(d64),
   20713                             OFFB_RIP
   20714                ));
   20715          DIP("jrcxz 0x%llx\n", d64);
   20716       }
   20717       return delta;
   20718 
   20719    case 0xE4: /* IN imm8, AL */
   20720       sz = 1;
   20721       t1 = newTemp(Ity_I64);
   20722       abyte = getUChar(delta); delta++;
   20723       assign(t1, mkU64( abyte & 0xFF ));
   20724       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
   20725       goto do_IN;
   20726    case 0xE5: /* IN imm8, eAX */
   20727       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20728       t1 = newTemp(Ity_I64);
   20729       abyte = getUChar(delta); delta++;
   20730       assign(t1, mkU64( abyte & 0xFF ));
   20731       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
   20732       goto do_IN;
   20733    case 0xEC: /* IN %DX, AL */
   20734       sz = 1;
   20735       t1 = newTemp(Ity_I64);
   20736       assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
   20737       DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
   20738                                          nameIRegRAX(sz));
   20739       goto do_IN;
   20740    case 0xED: /* IN %DX, eAX */
   20741       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20742       t1 = newTemp(Ity_I64);
   20743       assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
   20744       DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
   20745                                          nameIRegRAX(sz));
   20746       goto do_IN;
   20747    do_IN: {
   20748       /* At this point, sz indicates the width, and t1 is a 64-bit
   20749          value giving port number. */
   20750       IRDirty* d;
   20751       if (haveF2orF3(pfx)) goto decode_failure;
   20752       vassert(sz == 1 || sz == 2 || sz == 4);
   20753       ty = szToITy(sz);
   20754       t2 = newTemp(Ity_I64);
   20755       d = unsafeIRDirty_1_N(
   20756              t2,
   20757              0/*regparms*/,
   20758              "amd64g_dirtyhelper_IN",
   20759              &amd64g_dirtyhelper_IN,
   20760              mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
   20761           );
   20762       /* do the call, dumping the result in t2. */
   20763       stmt( IRStmt_Dirty(d) );
   20764       putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
   20765       return delta;
   20766    }
   20767 
   20768    case 0xE6: /* OUT AL, imm8 */
   20769       sz = 1;
   20770       t1 = newTemp(Ity_I64);
   20771       abyte = getUChar(delta); delta++;
   20772       assign( t1, mkU64( abyte & 0xFF ) );
   20773       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   20774       goto do_OUT;
   20775    case 0xE7: /* OUT eAX, imm8 */
   20776       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20777       t1 = newTemp(Ity_I64);
   20778       abyte = getUChar(delta); delta++;
   20779       assign( t1, mkU64( abyte & 0xFF ) );
   20780       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   20781       goto do_OUT;
   20782    case 0xEE: /* OUT AL, %DX */
   20783       sz = 1;
   20784       t1 = newTemp(Ity_I64);
   20785       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   20786       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   20787                                           nameIRegRDX(2));
   20788       goto do_OUT;
   20789    case 0xEF: /* OUT eAX, %DX */
   20790       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20791       t1 = newTemp(Ity_I64);
   20792       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   20793       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   20794                                           nameIRegRDX(2));
   20795       goto do_OUT;
   20796    do_OUT: {
   20797       /* At this point, sz indicates the width, and t1 is a 64-bit
   20798          value giving port number. */
   20799       IRDirty* d;
   20800       if (haveF2orF3(pfx)) goto decode_failure;
   20801       vassert(sz == 1 || sz == 2 || sz == 4);
   20802       ty = szToITy(sz);
   20803       d = unsafeIRDirty_0_N(
   20804              0/*regparms*/,
   20805              "amd64g_dirtyhelper_OUT",
   20806              &amd64g_dirtyhelper_OUT,
   20807              mkIRExprVec_3( mkexpr(t1),
   20808                             widenUto64( getIRegRAX(sz) ),
   20809                             mkU64(sz) )
   20810           );
   20811       stmt( IRStmt_Dirty(d) );
   20812       return delta;
   20813    }
   20814 
   20815    case 0xE8: /* CALL J4 */
   20816       if (haveF3(pfx)) goto decode_failure;
   20817       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20818       d64 = getSDisp32(delta); delta += 4;
   20819       d64 += (guest_RIP_bbstart+delta);
   20820       /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
   20821       t1 = newTemp(Ity_I64);
   20822       assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   20823       putIReg64(R_RSP, mkexpr(t1));
   20824       storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
   20825       t2 = newTemp(Ity_I64);
   20826       assign(t2, mkU64((Addr64)d64));
   20827       make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
   20828       if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
   20829          /* follow into the call target. */
   20830          dres->whatNext   = Dis_ResteerU;
   20831          dres->continueAt = d64;
   20832       } else {
   20833          jmp_lit(dres, Ijk_Call, d64);
   20834          vassert(dres->whatNext == Dis_StopHere);
   20835       }
   20836       DIP("call 0x%llx\n",d64);
   20837       return delta;
   20838 
   20839    case 0xE9: /* Jv (jump, 16/32 offset) */
   20840       if (haveF3(pfx)) goto decode_failure;
   20841       if (sz != 4)
   20842          goto decode_failure; /* JRS added 2004 July 11 */
   20843       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20844       d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
   20845       delta += sz;
   20846       if (resteerOkFn(callback_opaque, (Addr64)d64)) {
   20847          dres->whatNext   = Dis_ResteerU;
   20848          dres->continueAt = d64;
   20849       } else {
   20850          jmp_lit(dres, Ijk_Boring, d64);
   20851          vassert(dres->whatNext == Dis_StopHere);
   20852       }
   20853       DIP("jmp 0x%llx\n", d64);
   20854       return delta;
   20855 
   20856    case 0xEB: /* Jb (jump, byte offset) */
   20857       if (haveF3(pfx)) goto decode_failure;
   20858       if (sz != 4)
   20859          goto decode_failure; /* JRS added 2004 July 11 */
   20860       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20861       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   20862       delta++;
   20863       if (resteerOkFn(callback_opaque, (Addr64)d64)) {
   20864          dres->whatNext   = Dis_ResteerU;
   20865          dres->continueAt = d64;
   20866       } else {
   20867          jmp_lit(dres, Ijk_Boring, d64);
   20868          vassert(dres->whatNext == Dis_StopHere);
   20869       }
   20870       DIP("jmp-8 0x%llx\n", d64);
   20871       return delta;
   20872 
   20873    case 0xF5: /* CMC */
   20874    case 0xF8: /* CLC */
   20875    case 0xF9: /* STC */
   20876       t1 = newTemp(Ity_I64);
   20877       t2 = newTemp(Ity_I64);
   20878       assign( t1, mk_amd64g_calculate_rflags_all() );
   20879       switch (opc) {
   20880          case 0xF5:
   20881             assign( t2, binop(Iop_Xor64, mkexpr(t1),
   20882                                          mkU64(AMD64G_CC_MASK_C)));
   20883             DIP("cmc\n");
   20884             break;
   20885          case 0xF8:
   20886             assign( t2, binop(Iop_And64, mkexpr(t1),
   20887                                          mkU64(~AMD64G_CC_MASK_C)));
   20888             DIP("clc\n");
   20889             break;
   20890          case 0xF9:
   20891             assign( t2, binop(Iop_Or64, mkexpr(t1),
   20892                                         mkU64(AMD64G_CC_MASK_C)));
   20893             DIP("stc\n");
   20894             break;
   20895          default:
   20896             vpanic("disInstr(x64)(cmc/clc/stc)");
   20897       }
   20898       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   20899       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   20900       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
   20901       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   20902          elimination of previous stores to this field work better. */
   20903       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   20904       return delta;
   20905 
   20906    case 0xF6: { /* Grp3 Eb */
   20907       Bool decode_OK = True;
   20908       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20909       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   20910       delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
   20911       if (!decode_OK) goto decode_failure;
   20912       return delta;
   20913    }
   20914 
   20915    case 0xF7: { /* Grp3 Ev */
   20916       Bool decode_OK = True;
   20917       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20918       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   20919       delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
   20920       if (!decode_OK) goto decode_failure;
   20921       return delta;
   20922    }
   20923 
   20924    case 0xFC: /* CLD */
   20925       if (haveF2orF3(pfx)) goto decode_failure;
   20926       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
   20927       DIP("cld\n");
   20928       return delta;
   20929 
   20930    case 0xFD: /* STD */
   20931       if (haveF2orF3(pfx)) goto decode_failure;
   20932       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
   20933       DIP("std\n");
   20934       return delta;
   20935 
   20936    case 0xFE: { /* Grp4 Eb */
   20937       Bool decode_OK = True;
   20938       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20939       /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
   20940       delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
   20941       if (!decode_OK) goto decode_failure;
   20942       return delta;
   20943    }
   20944 
   20945    case 0xFF: { /* Grp5 Ev */
   20946       Bool decode_OK = True;
   20947       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20948       /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
   20949       delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
   20950       if (!decode_OK) goto decode_failure;
   20951       return delta;
   20952    }
   20953 
   20954    default:
   20955       break;
   20956 
   20957    }
   20958 
   20959   decode_failure:
   20960    return deltaIN; /* fail */
   20961 }
   20962 
   20963 
   20964 /*------------------------------------------------------------*/
   20965 /*---                                                      ---*/
   20966 /*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
   20967 /*---                                                      ---*/
   20968 /*------------------------------------------------------------*/
   20969 
   20970 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   20971 {
   20972    IRTemp t2 = newTemp(ty);
   20973    if (ty == Ity_I64) {
   20974       IRTemp m8  = newTemp(Ity_I64);
   20975       IRTemp s8  = newTemp(Ity_I64);
   20976       IRTemp m16 = newTemp(Ity_I64);
   20977       IRTemp s16 = newTemp(Ity_I64);
   20978       IRTemp m32 = newTemp(Ity_I64);
   20979       assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
   20980       assign( s8,
   20981               binop(Iop_Or64,
   20982                     binop(Iop_Shr64,
   20983                           binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
   20984                           mkU8(8)),
   20985                     binop(Iop_And64,
   20986                           binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
   20987                           mkexpr(m8))
   20988                    )
   20989             );
   20990 
   20991       assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
   20992       assign( s16,
   20993               binop(Iop_Or64,
   20994                     binop(Iop_Shr64,
   20995                           binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
   20996                           mkU8(16)),
   20997                     binop(Iop_And64,
   20998                           binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
   20999                           mkexpr(m16))
   21000                    )
   21001             );
   21002 
   21003       assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
   21004       assign( t2,
   21005               binop(Iop_Or64,
   21006                     binop(Iop_Shr64,
   21007                           binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
   21008                           mkU8(32)),
   21009                     binop(Iop_And64,
   21010                           binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
   21011                           mkexpr(m32))
   21012                    )
   21013             );
   21014       return t2;
   21015    }
   21016    if (ty == Ity_I32) {
   21017       assign( t2,
   21018          binop(
   21019             Iop_Or32,
   21020             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   21021             binop(
   21022                Iop_Or32,
   21023                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   21024                                 mkU32(0x00FF0000)),
   21025                binop(Iop_Or32,
   21026                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   21027                                       mkU32(0x0000FF00)),
   21028                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   21029                                       mkU32(0x000000FF) )
   21030             )))
   21031       );
   21032       return t2;
   21033    }
   21034    if (ty == Ity_I16) {
   21035       assign(t2,
   21036              binop(Iop_Or16,
   21037                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   21038                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   21039       return t2;
   21040    }
   21041    vassert(0);
   21042    /*NOTREACHED*/
   21043    return IRTemp_INVALID;
   21044 }
   21045 
   21046 
   21047 __attribute__((noinline))
   21048 static
   21049 Long dis_ESC_0F (
   21050         /*MB_OUT*/DisResult* dres,
   21051         /*MB_OUT*/Bool*      expect_CAS,
   21052         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   21053         Bool         resteerCisOk,
   21054         void*        callback_opaque,
   21055         const VexArchInfo* archinfo,
   21056         const VexAbiInfo*  vbi,
   21057         Prefix pfx, Int sz, Long deltaIN
   21058      )
   21059 {
   21060    Long   d64   = 0;
   21061    IRTemp addr  = IRTemp_INVALID;
   21062    IRTemp t1    = IRTemp_INVALID;
   21063    IRTemp t2    = IRTemp_INVALID;
   21064    UChar  modrm = 0;
   21065    Int    am_sz = 0;
   21066    Int    alen  = 0;
   21067    HChar  dis_buf[50];
   21068 
   21069    /* In the first switch, look for ordinary integer insns. */
   21070    Long   delta = deltaIN;
   21071    UChar  opc   = getUChar(delta);
   21072    delta++;
   21073    switch (opc) { /* first switch */
   21074 
   21075    case 0x01:
   21076    {
   21077       modrm = getUChar(delta);
   21078       /* 0F 01 /0 -- SGDT */
   21079       /* 0F 01 /1 -- SIDT */
   21080       if (!epartIsReg(modrm)
   21081           && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
   21082          /* This is really revolting, but ... since each processor
   21083             (core) only has one IDT and one GDT, just let the guest
   21084             see it (pass-through semantics).  I can't see any way to
   21085             construct a faked-up value, so don't bother to try. */
   21086          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21087          delta += alen;
   21088          switch (gregLO3ofRM(modrm)) {
   21089             case 0: DIP("sgdt %s\n", dis_buf); break;
   21090             case 1: DIP("sidt %s\n", dis_buf); break;
   21091             default: vassert(0); /*NOTREACHED*/
   21092          }
   21093          IRDirty* d = unsafeIRDirty_0_N (
   21094                           0/*regparms*/,
   21095                           "amd64g_dirtyhelper_SxDT",
   21096                           &amd64g_dirtyhelper_SxDT,
   21097                           mkIRExprVec_2( mkexpr(addr),
   21098                                          mkU64(gregLO3ofRM(modrm)) )
   21099                       );
   21100          /* declare we're writing memory */
   21101          d->mFx   = Ifx_Write;
   21102          d->mAddr = mkexpr(addr);
   21103          d->mSize = 6;
   21104          stmt( IRStmt_Dirty(d) );
   21105          return delta;
   21106       }
   21107       /* 0F 01 D0 = XGETBV */
   21108       if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21109          delta += 1;
   21110          DIP("xgetbv\n");
   21111          /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
   21112             am not sure if that translates in to SEGV or to something
   21113             else, in user space. */
   21114          t1 = newTemp(Ity_I32);
   21115          assign( t1, getIReg32(R_RCX) );
   21116          stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
   21117                            Ijk_SigSEGV,
   21118                            IRConst_U64(guest_RIP_curr_instr),
   21119                            OFFB_RIP
   21120          ));
   21121          putIRegRAX(4, mkU32(7));
   21122          putIRegRDX(4, mkU32(0));
   21123          return delta;
   21124       }
   21125       /* BEGIN HACKY SUPPORT FOR xend */
   21126       /* 0F 01 D5 = XEND */
   21127       if (modrm == 0xD5 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         /* We are never in a transaction (xbegin immediately aborts).
   21129             So this just always generates a General Protection Fault. */
   21130          delta += 1;
   21131          jmp_lit(dres, Ijk_SigSEGV, guest_RIP_bbstart + delta);
   21132          vassert(dres->whatNext == Dis_StopHere);
   21133          DIP("xend\n");
   21134          return delta;
   21135       }
   21136       /* END HACKY SUPPORT FOR xend */
   21137       /* BEGIN HACKY SUPPORT FOR xtest */
   21138       /* 0F 01 D6 = XTEST */
   21139       if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21140          /* Sets ZF because there never is a transaction, and all
   21141             CF, OF, SF, PF and AF are always cleared by xtest. */
   21142          delta += 1;
   21143          DIP("xtest\n");
   21144          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21145          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21146          stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
   21147          /* Set NDEP even though it isn't used.  This makes redundant-PUT
   21148             elimination of previous stores to this field work better. */
   21149          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21150          return delta;
   21151       }
   21152       /* END HACKY SUPPORT FOR xtest */
   21153       /* 0F 01 F9 = RDTSCP */
   21154       if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
   21155          delta += 1;
   21156          /* Uses dirty helper:
   21157             void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
   21158             declared to wr rax, rcx, rdx
   21159          */
   21160          const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
   21161          void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
   21162          IRDirty* d
   21163             = unsafeIRDirty_0_N ( 0/*regparms*/,
   21164                                   fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21165          /* declare guest state effects */
   21166          d->nFxState = 3;
   21167          vex_bzero(&d->fxState, sizeof(d->fxState));
   21168          d->fxState[0].fx     = Ifx_Write;
   21169          d->fxState[0].offset = OFFB_RAX;
   21170          d->fxState[0].size   = 8;
   21171          d->fxState[1].fx     = Ifx_Write;
   21172          d->fxState[1].offset = OFFB_RCX;
   21173          d->fxState[1].size   = 8;
   21174          d->fxState[2].fx     = Ifx_Write;
   21175          d->fxState[2].offset = OFFB_RDX;
   21176          d->fxState[2].size   = 8;
   21177          /* execute the dirty call, side-effecting guest state */
   21178          stmt( IRStmt_Dirty(d) );
   21179          /* RDTSCP is a serialising insn.  So, just in case someone is
   21180             using it as a memory fence ... */
   21181          stmt( IRStmt_MBE(Imbe_Fence) );
   21182          DIP("rdtscp\n");
   21183          return delta;
   21184       }
   21185       /* else decode failed */
   21186       break;
   21187    }
   21188 
   21189    case 0x05: /* SYSCALL */
   21190       guest_RIP_next_mustcheck = True;
   21191       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   21192       putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
   21193       /* It's important that all guest state is up-to-date
   21194          at this point.  So we declare an end-of-block here, which
   21195          forces any cached guest state to be flushed. */
   21196       jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
   21197       vassert(dres->whatNext == Dis_StopHere);
   21198       DIP("syscall\n");
   21199       return delta;
   21200 
   21201    case 0x0B: /* UD2 */
   21202       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   21203       jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
   21204       vassert(dres->whatNext == Dis_StopHere);
   21205       DIP("ud2\n");
   21206       return delta;
   21207 
   21208    case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
   21209               /* 0F 0D /1 -- prefetchw mem8 */
   21210       if (have66orF2orF3(pfx)) goto decode_failure;
   21211       modrm = getUChar(delta);
   21212       if (epartIsReg(modrm)) goto decode_failure;
   21213       if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   21214          goto decode_failure;
   21215       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21216       delta += alen;
   21217       switch (gregLO3ofRM(modrm)) {
   21218          case 0: DIP("prefetch %s\n", dis_buf); break;
   21219          case 1: DIP("prefetchw %s\n", dis_buf); break;
   21220          default: vassert(0); /*NOTREACHED*/
   21221       }
   21222       return delta;
   21223 
   21224    case 0x1F:
   21225       if (haveF2orF3(pfx)) goto decode_failure;
   21226       modrm = getUChar(delta);
   21227       if (epartIsReg(modrm)) goto decode_failure;
   21228       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21229       delta += alen;
   21230       DIP("nop%c %s\n", nameISize(sz), dis_buf);
   21231       return delta;
   21232 
   21233    case 0x31: { /* RDTSC */
   21234       IRTemp   val  = newTemp(Ity_I64);
   21235       IRExpr** args = mkIRExprVec_0();
   21236       IRDirty* d    = unsafeIRDirty_1_N (
   21237                          val,
   21238                          0/*regparms*/,
   21239                          "amd64g_dirtyhelper_RDTSC",
   21240                          &amd64g_dirtyhelper_RDTSC,
   21241                          args
   21242                       );
   21243       if (have66orF2orF3(pfx)) goto decode_failure;
   21244       /* execute the dirty call, dumping the result in val. */
   21245       stmt( IRStmt_Dirty(d) );
   21246       putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
   21247       putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
   21248       DIP("rdtsc\n");
   21249       return delta;
   21250    }
   21251 
   21252    case 0x40:
   21253    case 0x41:
   21254    case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   21255    case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   21256    case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   21257    case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   21258    case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   21259    case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   21260    case 0x48: /* CMOVSb (cmov negative) */
   case 0x49: /* CMOVNSb (cmov not negative) */
   21262    case 0x4A: /* CMOVP (cmov parity even) */
   21263    case 0x4B: /* CMOVNP (cmov parity odd) */
   21264    case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   21265    case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   21266    case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   21267    case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   21268       if (haveF2orF3(pfx)) goto decode_failure;
   21269       delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
   21270       return delta;
   21271 
   21272    case 0x80:
   21273    case 0x81:
   21274    case 0x82:   /* JBb/JNAEb (jump below) */
   21275    case 0x83:   /* JNBb/JAEb (jump not below) */
   21276    case 0x84:   /* JZb/JEb (jump zero) */
   21277    case 0x85:   /* JNZb/JNEb (jump not zero) */
   21278    case 0x86:   /* JBEb/JNAb (jump below or equal) */
   21279    case 0x87:   /* JNBEb/JAb (jump not below or equal) */
   21280    case 0x88:   /* JSb (jump negative) */
   case 0x89:   /* JNSb (jump not negative) */
   21282    case 0x8A:   /* JP (jump parity even) */
   21283    case 0x8B:   /* JNP/JPO (jump parity odd) */
   21284    case 0x8C:   /* JLb/JNGEb (jump less) */
   21285    case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
   21286    case 0x8E:   /* JLEb/JNGb (jump less or equal) */
   21287    case 0x8F: { /* JGb/JNLEb (jump greater) */
   21288       Long   jmpDelta;
   21289       const HChar* comment  = "";
   21290       if (haveF3(pfx)) goto decode_failure;
   21291       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21292       jmpDelta = getSDisp32(delta);
   21293       d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
   21294       delta += 4;
   21295       if (resteerCisOk
   21296           && vex_control.guest_chase_cond
   21297           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21298           && jmpDelta < 0
   21299           && resteerOkFn( callback_opaque, (Addr64)d64) ) {
   21300          /* Speculation: assume this backward branch is taken.  So
   21301             we need to emit a side-exit to the insn following this
   21302             one, on the negation of the condition, and continue at
   21303             the branch target address (d64).  If we wind up back at
   21304             the first instruction of the trace, just stop; it's
   21305             better to let the IR loop unroller handle that case. */
   21306          stmt( IRStmt_Exit(
   21307                   mk_amd64g_calculate_condition(
   21308                      (AMD64Condcode)(1 ^ (opc - 0x80))),
   21309                   Ijk_Boring,
   21310                   IRConst_U64(guest_RIP_bbstart+delta),
   21311                   OFFB_RIP
   21312              ));
   21313          dres->whatNext   = Dis_ResteerC;
   21314          dres->continueAt = d64;
   21315          comment = "(assumed taken)";
   21316       }
   21317       else
   21318       if (resteerCisOk
   21319           && vex_control.guest_chase_cond
   21320           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21321           && jmpDelta >= 0
   21322           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   21323          /* Speculation: assume this forward branch is not taken.
   21324             So we need to emit a side-exit to d64 (the dest) and
   21325             continue disassembling at the insn immediately
   21326             following this one. */
   21327          stmt( IRStmt_Exit(
   21328                   mk_amd64g_calculate_condition((AMD64Condcode)
   21329                                                 (opc - 0x80)),
   21330                   Ijk_Boring,
   21331                   IRConst_U64(d64),
   21332                   OFFB_RIP
   21333              ));
   21334          dres->whatNext   = Dis_ResteerC;
   21335          dres->continueAt = guest_RIP_bbstart+delta;
   21336          comment = "(assumed not taken)";
   21337       }
   21338       else {
   21339          /* Conservative default translation - end the block at
   21340             this point. */
   21341          jcc_01( dres, (AMD64Condcode)(opc - 0x80),
   21342                  guest_RIP_bbstart+delta, d64 );
   21343          vassert(dres->whatNext == Dis_StopHere);
   21344       }
   21345       DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
   21346       return delta;
   21347    }
   21348 
   21349    case 0x90:
   21350    case 0x91:
   21351    case 0x92: /* set-Bb/set-NAEb (set if below) */
   21352    case 0x93: /* set-NBb/set-AEb (set if not below) */
   21353    case 0x94: /* set-Zb/set-Eb (set if zero) */
   21354    case 0x95: /* set-NZb/set-NEb (set if not zero) */
   21355    case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   21356    case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   21357    case 0x98: /* set-Sb (set if negative) */
   case 0x99: /* set-NSb (set if not negative) */
   21359    case 0x9A: /* set-P (set if parity even) */
   21360    case 0x9B: /* set-NP (set if parity odd) */
   21361    case 0x9C: /* set-Lb/set-NGEb (set if less) */
   21362    case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   21363    case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   21364    case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   21365       if (haveF2orF3(pfx)) goto decode_failure;
   21366       t1 = newTemp(Ity_I8);
   21367       assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
   21368       modrm = getUChar(delta);
   21369       if (epartIsReg(modrm)) {
   21370          delta++;
   21371          putIRegE(1, pfx, modrm, mkexpr(t1));
   21372          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
   21373                            nameIRegE(1,pfx,modrm));
   21374       } else {
   21375          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21376          delta += alen;
   21377          storeLE( mkexpr(addr), mkexpr(t1) );
   21378          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
   21379       }
   21380       return delta;
   21381 
   21382    case 0x1A:
   21383    case 0x1B: { /* Future MPX instructions, currently NOPs.
   21384                    BNDMK b, m     F3 0F 1B
   21385                    BNDCL b, r/m   F3 0F 1A
   21386                    BNDCU b, r/m   F2 0F 1A
   21387                    BNDCN b, r/m   F2 0F 1B
   21388                    BNDMOV b, b/m  66 0F 1A
   21389                    BNDMOV b/m, b  66 0F 1B
   21390                    BNDLDX b, mib     0F 1A
   21391                    BNDSTX mib, b     0F 1B */
   21392 
   21393       /* All instructions have two operands. One operand is always the
   21394          bnd register number (bnd0-bnd3, other register numbers are
   21395          ignored when MPX isn't enabled, but should generate an
   21396          exception if MPX is enabled) given by gregOfRexRM. The other
   21397          operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
   21398          address, all of which can be decoded by using either
   21399          eregOfRexRM or disAMode. */
   21400 
   21401       modrm = getUChar(delta);
   21402       int bnd = gregOfRexRM(pfx,modrm);
   21403       const HChar *oper;
   21404       if (epartIsReg(modrm)) {
   21405          oper = nameIReg64 (eregOfRexRM(pfx,modrm));
   21406          delta += 1;
   21407       } else {
   21408          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21409          delta += alen;
   21410          oper = dis_buf;
   21411       }
   21412 
   21413       if (haveF3no66noF2 (pfx)) {
   21414          if (opc == 0x1B) {
   21415             DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
   21416          } else /* opc == 0x1A */ {
   21417             DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
   21418          }
   21419       } else if (haveF2no66noF3 (pfx)) {
   21420          if (opc == 0x1A) {
   21421             DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
   21422          } else /* opc == 0x1B */ {
   21423             DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
   21424          }
   21425       } else if (have66noF2noF3 (pfx)) {
   21426          if (opc == 0x1A) {
   21427             DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
   21428          } else /* opc == 0x1B */ {
   21429             DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
   21430          }
   21431       } else if (haveNo66noF2noF3 (pfx)) {
   21432          if (opc == 0x1A) {
   21433             DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
   21434          } else /* opc == 0x1B */ {
   21435             DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
   21436          }
   21437       } else goto decode_failure;
   21438 
   21439       return delta;
   21440    }
   21441 
   21442    case 0xA2: { /* CPUID */
   21443       /* Uses dirty helper:
   21444             void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
   21445          declared to mod rax, wr rbx, rcx, rdx
   21446       */
   21447       IRDirty*     d     = NULL;
   21448       const HChar* fName = NULL;
   21449       void*        fAddr = NULL;
   21450 
   21451       /* JRS 2014-11-11: this a really horrible temp kludge to work
   21452          around the fact that the Yosemite (OSX 10.10)
   21453          /usr/lib/system/libdyld.dylib expects XSAVE/XRSTOR to be
   21454          implemented, because amd64g_dirtyhelper_CPUID_avx_and_cx16
   21455          claims they are supported, but so far they aren't.  So cause
   21456          it to fall back to a simpler CPU.  The cleaner approach of
   21457          setting CPUID(eax=1).OSXSAVE=0 and .XSAVE=0 isn't desirable
   21458          since it will (per the official Intel guidelines) lead to
   21459          software concluding that AVX isn't supported.
   21460 
   21461          This is also a kludge in that putting these ifdefs here checks
   21462          the build (host) architecture, when really we're checking the
   21463          guest architecture. */
   21464       Bool this_is_yosemite = False;
   21465 #     if defined(VGP_amd64_darwin) && DARWIN_VERS == DARWIN_10_10
   21466       this_is_yosemite = True;
   21467 #     endif
   21468 
   21469       if (haveF2orF3(pfx)) goto decode_failure;
   21470       /* This isn't entirely correct, CPUID should depend on the VEX
   21471          capabilities, not on the underlying CPU. See bug #324882. */
   21472       if (!this_is_yosemite &&
   21473           (archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21474           (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   21475           (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21476          fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
   21477          fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
   21478          /* This is a Core-i5-2300-like machine */
   21479       }
   21480       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21481                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
   21482          fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
   21483          fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
   21484          /* This is a Core-i5-670-like machine */
   21485       }
   21486       else {
   21487          /* Give a CPUID for at least a baseline machine, SSE2
   21488             only, and no CX16 */
   21489          fName = "amd64g_dirtyhelper_CPUID_baseline";
   21490          fAddr = &amd64g_dirtyhelper_CPUID_baseline;
   21491       }
   21492 
   21493       vassert(fName); vassert(fAddr);
   21494       d = unsafeIRDirty_0_N ( 0/*regparms*/,
   21495                               fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21496       /* declare guest state effects */
   21497       d->nFxState = 4;
   21498       vex_bzero(&d->fxState, sizeof(d->fxState));
   21499       d->fxState[0].fx     = Ifx_Modify;
   21500       d->fxState[0].offset = OFFB_RAX;
   21501       d->fxState[0].size   = 8;
   21502       d->fxState[1].fx     = Ifx_Write;
   21503       d->fxState[1].offset = OFFB_RBX;
   21504       d->fxState[1].size   = 8;
   21505       d->fxState[2].fx     = Ifx_Modify;
   21506       d->fxState[2].offset = OFFB_RCX;
   21507       d->fxState[2].size   = 8;
   21508       d->fxState[3].fx     = Ifx_Write;
   21509       d->fxState[3].offset = OFFB_RDX;
   21510       d->fxState[3].size   = 8;
   21511       /* execute the dirty call, side-effecting guest state */
   21512       stmt( IRStmt_Dirty(d) );
   21513       /* CPUID is a serialising insn.  So, just in case someone is
   21514          using it as a memory fence ... */
   21515       stmt( IRStmt_MBE(Imbe_Fence) );
   21516       DIP("cpuid\n");
   21517       return delta;
   21518    }
   21519 
   21520    case 0xA3: { /* BT Gv,Ev */
   21521       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21522       Bool ok = True;
   21523       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21524       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
   21525       if (!ok) goto decode_failure;
   21526       return delta;
   21527    }
   21528 
   21529    case 0xA4: /* SHLDv imm8,Gv,Ev */
   21530       modrm = getUChar(delta);
   21531       d64   = delta + lengthAMode(pfx, delta);
   21532       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   21533       delta = dis_SHLRD_Gv_Ev (
   21534                  vbi, pfx, delta, modrm, sz,
   21535                  mkU8(getUChar(d64)), True, /* literal */
   21536                  dis_buf, True /* left */ );
   21537       return delta;
   21538 
   21539    case 0xA5: /* SHLDv %cl,Gv,Ev */
   21540       modrm = getUChar(delta);
   21541       delta = dis_SHLRD_Gv_Ev (
   21542                  vbi, pfx, delta, modrm, sz,
   21543                  getIRegCL(), False, /* not literal */
   21544                  "%cl", True /* left */ );
   21545       return delta;
   21546 
   21547    case 0xAB: { /* BTS Gv,Ev */
   21548       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21549       Bool ok = True;
   21550       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21551       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
   21552       if (!ok) goto decode_failure;
   21553       return delta;
   21554    }
   21555 
   21556    case 0xAC: /* SHRDv imm8,Gv,Ev */
   21557       modrm = getUChar(delta);
   21558       d64   = delta + lengthAMode(pfx, delta);
   21559       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   21560       delta = dis_SHLRD_Gv_Ev (
   21561                  vbi, pfx, delta, modrm, sz,
   21562                  mkU8(getUChar(d64)), True, /* literal */
   21563                  dis_buf, False /* right */ );
   21564       return delta;
   21565 
   21566    case 0xAD: /* SHRDv %cl,Gv,Ev */
   21567       modrm = getUChar(delta);
   21568       delta = dis_SHLRD_Gv_Ev (
   21569                  vbi, pfx, delta, modrm, sz,
   21570                  getIRegCL(), False, /* not literal */
   21571                  "%cl", False /* right */);
   21572       return delta;
   21573 
   21574    case 0xAF: /* IMUL Ev, Gv */
   21575       if (haveF2orF3(pfx)) goto decode_failure;
   21576       delta = dis_mul_E_G ( vbi, pfx, sz, delta );
   21577       return delta;
   21578 
   21579    case 0xB0: { /* CMPXCHG Gb,Eb */
   21580       Bool ok = True;
   21581       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   21582       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
   21583       if (!ok) goto decode_failure;
   21584       return delta;
   21585    }
   21586 
   21587    case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
   21588       Bool ok = True;
   21589       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   21590       if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
   21591       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
   21592       if (!ok) goto decode_failure;
   21593       return delta;
   21594    }
   21595 
   21596    case 0xB3: { /* BTR Gv,Ev */
   21597       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21598       Bool ok = True;
   21599       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21600       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
   21601       if (!ok) goto decode_failure;
   21602       return delta;
   21603    }
   21604 
   21605    case 0xB6: /* MOVZXb Eb,Gv */
   21606       if (haveF2orF3(pfx)) goto decode_failure;
   21607       if (sz != 2 && sz != 4 && sz != 8)
   21608          goto decode_failure;
   21609       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
   21610       return delta;
   21611 
   21612    case 0xB7: /* MOVZXw Ew,Gv */
   21613       if (haveF2orF3(pfx)) goto decode_failure;
   21614       if (sz != 4 && sz != 8)
   21615          goto decode_failure;
   21616       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
   21617       return delta;
   21618 
   21619    case 0xBA: { /* Grp8 Ib,Ev */
   21620       /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
   21621       Bool decode_OK = False;
   21622       modrm = getUChar(delta);
   21623       am_sz = lengthAMode(pfx,delta);
   21624       d64   = getSDisp8(delta + am_sz);
   21625       delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
   21626                              &decode_OK );
   21627       if (!decode_OK)
   21628          goto decode_failure;
   21629       return delta;
   21630    }
   21631 
   21632    case 0xBB: { /* BTC Gv,Ev */
   21633       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21634       Bool ok = False;
   21635       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21636       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
   21637       if (!ok) goto decode_failure;
   21638       return delta;
   21639    }
   21640 
   21641    case 0xBC: /* BSF Gv,Ev */
   21642       if (!haveF2orF3(pfx)
   21643           || (haveF3noF2(pfx)
   21644               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
   21645          /* no-F2 no-F3 0F BC = BSF
   21646                   or F3 0F BC = REP; BSF on older CPUs.  */
   21647          delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
   21648          return delta;
   21649       }
   21650       /* Fall through, since F3 0F BC is TZCNT, and needs to
   21651          be handled by dis_ESC_0F__SSE4. */
   21652       break;
   21653 
   21654    case 0xBD: /* BSR Gv,Ev */
   21655       if (!haveF2orF3(pfx)
   21656           || (haveF3noF2(pfx)
   21657               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
   21658          /* no-F2 no-F3 0F BD = BSR
   21659                   or F3 0F BD = REP; BSR on older CPUs.  */
   21660          delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
   21661          return delta;
   21662       }
   21663       /* Fall through, since F3 0F BD is LZCNT, and needs to
   21664          be handled by dis_ESC_0F__SSE4. */
   21665       break;
   21666 
   21667    case 0xBE: /* MOVSXb Eb,Gv */
   21668       if (haveF2orF3(pfx)) goto decode_failure;
   21669       if (sz != 2 && sz != 4 && sz != 8)
   21670          goto decode_failure;
   21671       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
   21672       return delta;
   21673 
   21674    case 0xBF: /* MOVSXw Ew,Gv */
   21675       if (haveF2orF3(pfx)) goto decode_failure;
   21676       if (sz != 4 && sz != 8)
   21677          goto decode_failure;
   21678       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
   21679       return delta;
   21680 
   21681    case 0xC0: { /* XADD Gb,Eb */
   21682       Bool decode_OK = False;
   21683       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   21684       if (!decode_OK)
   21685          goto decode_failure;
   21686       return delta;
   21687    }
   21688 
   21689    case 0xC1: { /* XADD Gv,Ev */
   21690       Bool decode_OK = False;
   21691       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   21692       if (!decode_OK)
   21693          goto decode_failure;
   21694       return delta;
   21695    }
   21696 
   21697    case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
   21698       IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
   21699       IRTemp  expdHi     = newTemp(elemTy);
   21700       IRTemp  expdLo     = newTemp(elemTy);
   21701       IRTemp  dataHi     = newTemp(elemTy);
   21702       IRTemp  dataLo     = newTemp(elemTy);
   21703       IRTemp  oldHi      = newTemp(elemTy);
   21704       IRTemp  oldLo      = newTemp(elemTy);
   21705       IRTemp  flags_old  = newTemp(Ity_I64);
   21706       IRTemp  flags_new  = newTemp(Ity_I64);
   21707       IRTemp  success    = newTemp(Ity_I1);
   21708       IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
   21709       IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
   21710       IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
   21711       IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
   21712       IRTemp expdHi64    = newTemp(Ity_I64);
   21713       IRTemp expdLo64    = newTemp(Ity_I64);
   21714 
   21715       /* Translate this using a DCAS, even if there is no LOCK
   21716          prefix.  Life is too short to bother with generating two
   21717          different translations for the with/without-LOCK-prefix
   21718          cases. */
   21719       *expect_CAS = True;
   21720 
   21721       /* Decode, and generate address. */
   21722       if (have66(pfx)) goto decode_failure;
   21723       if (sz != 4 && sz != 8) goto decode_failure;
   21724       if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
   21725          goto decode_failure;
   21726       modrm = getUChar(delta);
   21727       if (epartIsReg(modrm)) goto decode_failure;
   21728       if (gregLO3ofRM(modrm) != 1) goto decode_failure;
   21729       if (haveF2orF3(pfx)) {
   21730          /* Since the e-part is memory only, F2 or F3 (one or the
   21731             other) is acceptable if LOCK is also present.  But only
   21732             for cmpxchg8b. */
   21733          if (sz == 8) goto decode_failure;
   21734          if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
   21735       }
   21736 
   21737       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21738       delta += alen;
   21739 
   21740       /* cmpxchg16b requires an alignment check. */
   21741       if (sz == 8)
   21742          gen_SEGV_if_not_16_aligned( addr );
   21743 
   21744       /* Get the expected and new values. */
   21745       assign( expdHi64, getIReg64(R_RDX) );
   21746       assign( expdLo64, getIReg64(R_RAX) );
   21747 
   21748       /* These are the correctly-sized expected and new values.
   21749          However, we also get expdHi64/expdLo64 above as 64-bits
   21750          regardless, because we will need them later in the 32-bit
   21751          case (paradoxically). */
   21752       assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
   21753                             : mkexpr(expdHi64) );
   21754       assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
   21755                             : mkexpr(expdLo64) );
   21756       assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
   21757       assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
   21758 
   21759       /* Do the DCAS */
   21760       stmt( IRStmt_CAS(
   21761                mkIRCAS( oldHi, oldLo,
   21762                         Iend_LE, mkexpr(addr),
   21763                         mkexpr(expdHi), mkexpr(expdLo),
   21764                         mkexpr(dataHi), mkexpr(dataLo)
   21765             )));
   21766 
   21767       /* success when oldHi:oldLo == expdHi:expdLo */
   21768       assign( success,
   21769               binop(opCasCmpEQ,
   21770                     binop(opOR,
   21771                           binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
   21772                           binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
   21773                     ),
   21774                     zero
   21775               ));
   21776 
   21777       /* If the DCAS is successful, that is to say oldHi:oldLo ==
   21778          expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
   21779          which is where they came from originally.  Both the actual
   21780          contents of these two regs, and any shadow values, are
   21781          unchanged.  If the DCAS fails then we're putting into
   21782          RDX:RAX the value seen in memory. */
   21783       /* Now of course there's a complication in the 32-bit case
   21784          (bah!): if the DCAS succeeds, we need to leave RDX:RAX
   21785          unchanged; but if we use the same scheme as in the 64-bit
   21786          case, we get hit by the standard rule that a write to the
   21787          bottom 32 bits of an integer register zeros the upper 32
   21788          bits.  And so the upper halves of RDX and RAX mysteriously
   21789          become zero.  So we have to stuff back in the original
   21790          64-bit values which we previously stashed in
   21791          expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
   21792       /* It's just _so_ much fun ... */
   21793       putIRegRDX( 8,
   21794                   IRExpr_ITE( mkexpr(success),
   21795                               mkexpr(expdHi64),
   21796                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
   21797                                       : mkexpr(oldHi)
   21798                 ));
   21799       putIRegRAX( 8,
   21800                   IRExpr_ITE( mkexpr(success),
   21801                               mkexpr(expdLo64),
   21802                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
   21803                                       : mkexpr(oldLo)
   21804                 ));
   21805 
   21806       /* Copy the success bit into the Z flag and leave the others
   21807          unchanged */
   21808       assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
   21809       assign(
   21810          flags_new,
   21811          binop(Iop_Or64,
   21812                binop(Iop_And64, mkexpr(flags_old),
   21813                                 mkU64(~AMD64G_CC_MASK_Z)),
   21814                binop(Iop_Shl64,
   21815                      binop(Iop_And64,
   21816                            unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
   21817                      mkU8(AMD64G_CC_SHIFT_Z)) ));
   21818 
   21819       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21820       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   21821       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21822       /* Set NDEP even though it isn't used.  This makes
   21823          redundant-PUT elimination of previous stores to this field
   21824          work better. */
   21825       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21826 
   21827       /* Sheesh.  Aren't you glad it was me and not you that had to
   21828          write and validate all this grunge? */
   21829 
   21830       DIP("cmpxchg8b %s\n", dis_buf);
   21831       return delta;
   21832    }
   21833 
   21834    case 0xC8: /* BSWAP %eax */
   21835    case 0xC9:
   21836    case 0xCA:
   21837    case 0xCB:
   21838    case 0xCC:
   21839    case 0xCD:
   21840    case 0xCE:
   21841    case 0xCF: /* BSWAP %edi */
   21842       if (haveF2orF3(pfx)) goto decode_failure;
   21843       /* According to the AMD64 docs, this insn can have size 4 or
   21844          8. */
   21845       if (sz == 4) {
   21846          t1 = newTemp(Ity_I32);
   21847          assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
   21848          t2 = math_BSWAP( t1, Ity_I32 );
   21849          putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
   21850          DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
   21851          return delta;
   21852       }
   21853       if (sz == 8) {
   21854          t1 = newTemp(Ity_I64);
   21855          t2 = newTemp(Ity_I64);
   21856          assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
   21857          t2 = math_BSWAP( t1, Ity_I64 );
   21858          putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
   21859          DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
   21860          return delta;
   21861       }
   21862       goto decode_failure;
   21863 
   21864    default:
   21865       break;
   21866 
   21867    } /* first switch */
   21868 
   21869 
   21870    /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
   21871    /* In the second switch, pick off MMX insns. */
   21872 
   21873    if (!have66orF2orF3(pfx)) {
   21874       /* So there's no SIMD prefix. */
   21875 
   21876       vassert(sz == 4 || sz == 8);
   21877 
   21878       switch (opc) { /* second switch */
   21879 
   21880       case 0x71:
   21881       case 0x72:
   21882       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   21883 
   21884       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   21885       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   21886       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   21887       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   21888 
   21889       case 0xFC:
   21890       case 0xFD:
   21891       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   21892 
   21893       case 0xEC:
   21894       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21895 
   21896       case 0xDC:
   21897       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21898 
   21899       case 0xF8:
   21900       case 0xF9:
   21901       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   21902 
   21903       case 0xE8:
   21904       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21905 
   21906       case 0xD8:
   21907       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21908 
   21909       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   21910       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   21911 
   21912       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   21913 
   21914       case 0x74:
   21915       case 0x75:
   21916       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   21917 
   21918       case 0x64:
   21919       case 0x65:
   21920       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   21921 
   21922       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   21923       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   21924       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   21925 
   21926       case 0x68:
   21927       case 0x69:
   21928       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   21929 
   21930       case 0x60:
   21931       case 0x61:
   21932       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21933 
   21934       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   21935       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   21936       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   21937       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   21938 
   21939       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21940       case 0xF2:
   21941       case 0xF3:
   21942 
   21943       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21944       case 0xD2:
   21945       case 0xD3:
   21946 
   21947       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   21948       case 0xE2: {
   21949          Bool decode_OK = False;
   21950          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
   21951          if (decode_OK)
   21952             return delta;
   21953          goto decode_failure;
   21954       }
   21955 
   21956       default:
   21957          break;
   21958       } /* second switch */
   21959 
   21960    }
   21961 
   21962    /* A couple of MMX corner cases */
   21963    if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
   21964       if (sz != 4)
   21965          goto decode_failure;
   21966       do_EMMS_preamble();
   21967       DIP("{f}emms\n");
   21968       return delta;
   21969    }
   21970 
   21971    /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   21972    /* Perhaps it's an SSE or SSE2 instruction.  We can try this
   21973       without checking the guest hwcaps because SSE2 is a baseline
   21974       facility in 64 bit mode. */
   21975    {
   21976       Bool decode_OK = False;
   21977       delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
   21978       if (decode_OK)
   21979          return delta;
   21980    }
   21981 
   21982    /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   21983    /* Perhaps it's a SSE3 instruction.  FIXME: check guest hwcaps
   21984       first. */
   21985    {
   21986       Bool decode_OK = False;
   21987       delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   21988       if (decode_OK)
   21989          return delta;
   21990    }
   21991 
   21992    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   21993    /* Perhaps it's a SSE4 instruction.  FIXME: check guest hwcaps
   21994       first. */
   21995    {
   21996       Bool decode_OK = False;
   21997       delta = dis_ESC_0F__SSE4 ( &decode_OK,
   21998                                  archinfo, vbi, pfx, sz, deltaIN );
   21999       if (decode_OK)
   22000          return delta;
   22001    }
   22002 
   22003   decode_failure:
   22004    return deltaIN; /* fail */
   22005 }
   22006 
   22007 
   22008 /*------------------------------------------------------------*/
   22009 /*---                                                      ---*/
   22010 /*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
   22011 /*---                                                      ---*/
   22012 /*------------------------------------------------------------*/
   22013 
   22014 __attribute__((noinline))
   22015 static
   22016 Long dis_ESC_0F38 (
   22017         /*MB_OUT*/DisResult* dres,
   22018         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   22019         Bool         resteerCisOk,
   22020         void*        callback_opaque,
   22021         const VexArchInfo* archinfo,
   22022         const VexAbiInfo*  vbi,
   22023         Prefix pfx, Int sz, Long deltaIN
   22024      )
   22025 {
   22026    Long   delta = deltaIN;
   22027    UChar  opc   = getUChar(delta);
   22028    delta++;
   22029    switch (opc) {
   22030 
   22031    case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
   22032    case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
   22033       if (!haveF2orF3(pfx) && !haveVEX(pfx)
   22034           && (sz == 2 || sz == 4 || sz == 8)) {
   22035          IRTemp addr  = IRTemp_INVALID;
   22036          UChar  modrm = 0;
   22037          Int    alen  = 0;
   22038          HChar  dis_buf[50];
   22039          modrm = getUChar(delta);
   22040          if (epartIsReg(modrm)) break;
   22041          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22042          delta += alen;
   22043          IRType ty = szToITy(sz);
   22044          IRTemp src = newTemp(ty);
   22045          if (opc == 0xF0) { /* LOAD */
   22046             assign(src, loadLE(ty, mkexpr(addr)));
   22047             IRTemp dst = math_BSWAP(src, ty);
   22048             putIRegG(sz, pfx, modrm, mkexpr(dst));
   22049             DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
   22050          } else { /* STORE */
   22051             assign(src, getIRegG(sz, pfx, modrm));
   22052             IRTemp dst = math_BSWAP(src, ty);
   22053             storeLE(mkexpr(addr), mkexpr(dst));
   22054             DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
   22055          }
   22056          return delta;
   22057       }
   22058       /* else fall through; maybe one of the decoders below knows what
   22059          it is. */
   22060       break;
   22061    }
   22062 
   22063    default:
   22064       break;
   22065 
   22066    }
   22067 
   22068    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22069    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22070       rather than proceeding indiscriminately. */
   22071    {
   22072       Bool decode_OK = False;
   22073       delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22074       if (decode_OK)
   22075          return delta;
   22076    }
   22077 
   22078    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22079    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22080       rather than proceeding indiscriminately. */
   22081    {
   22082       Bool decode_OK = False;
   22083       delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22084       if (decode_OK)
   22085          return delta;
   22086    }
   22087 
   22088   /*decode_failure:*/
   22089    return deltaIN; /* fail */
   22090 }
   22091 
   22092 
   22093 /*------------------------------------------------------------*/
   22094 /*---                                                      ---*/
   22095 /*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
   22096 /*---                                                      ---*/
   22097 /*------------------------------------------------------------*/
   22098 
   22099 __attribute__((noinline))
   22100 static
   22101 Long dis_ESC_0F3A (
   22102         /*MB_OUT*/DisResult* dres,
   22103         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   22104         Bool         resteerCisOk,
   22105         void*        callback_opaque,
   22106         const VexArchInfo* archinfo,
   22107         const VexAbiInfo*  vbi,
   22108         Prefix pfx, Int sz, Long deltaIN
   22109      )
   22110 {
   22111    Long   delta = deltaIN;
   22112    UChar  opc   = getUChar(delta);
   22113    delta++;
   22114    switch (opc) {
   22115 
   22116    default:
   22117       break;
   22118 
   22119    }
   22120 
   22121    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22122    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22123       rather than proceeding indiscriminately. */
   22124    {
   22125       Bool decode_OK = False;
   22126       delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22127       if (decode_OK)
   22128          return delta;
   22129    }
   22130 
   22131    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22132    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22133       rather than proceeding indiscriminately. */
   22134    {
   22135       Bool decode_OK = False;
   22136       delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22137       if (decode_OK)
   22138          return delta;
   22139    }
   22140 
   22141    return deltaIN; /* fail */
   22142 }
   22143 
   22144 
   22145 /*------------------------------------------------------------*/
   22146 /*---                                                      ---*/
   22147 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
   22148 /*---                                                      ---*/
   22149 /*------------------------------------------------------------*/
   22150 
   22151 /* FIXME: common up with the _256_ version below? */
   22152 static
   22153 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
   22154         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22155         Prefix pfx, Long delta, const HChar* name,
   22156         /* The actual operation.  Use either 'op' or 'opfn',
   22157            but not both. */
   22158         IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
   22159         Bool invertLeftArg,
   22160         Bool swapArgs
   22161      )
   22162 {
   22163    UChar  modrm = getUChar(delta);
   22164    UInt   rD    = gregOfRexRM(pfx, modrm);
   22165    UInt   rSL   = getVexNvvvv(pfx);
   22166    IRTemp tSL   = newTemp(Ity_V128);
   22167    IRTemp tSR   = newTemp(Ity_V128);
   22168    IRTemp addr  = IRTemp_INVALID;
   22169    HChar  dis_buf[50];
   22170    Int    alen  = 0;
   22171    vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
   22172 
   22173    assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
   22174                              : getXMMReg(rSL));
   22175 
   22176    if (epartIsReg(modrm)) {
   22177       UInt rSR = eregOfRexRM(pfx, modrm);
   22178       delta += 1;
   22179       assign(tSR, getXMMReg(rSR));
   22180       DIP("%s %s,%s,%s\n",
   22181           name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
   22182    } else {
   22183       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   22184       delta += alen;
   22185       assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
   22186       DIP("%s %s,%s,%s\n",
   22187           name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
   22188    }
   22189 
   22190    IRTemp res = IRTemp_INVALID;
   22191    if (op != Iop_INVALID) {
   22192       vassert(opFn == NULL);
   22193       res = newTemp(Ity_V128);
   22194       if (requiresRMode(op)) {
   22195          IRTemp rm = newTemp(Ity_I32);
   22196          assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
   22197          assign(res, swapArgs
   22198                         ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
   22199                         : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
   22200       } else {
   22201          assign(res, swapArgs
   22202                         ? binop(op, mkexpr(tSR), mkexpr(tSL))
   22203                         : binop(op, mkexpr(tSL), mkexpr(tSR)));
   22204       }
   22205    } else {
   22206       vassert(opFn != NULL);
   22207       res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   22208    }
   22209 
   22210    putYMMRegLoAndZU(rD, mkexpr(res));
   22211 
   22212    *uses_vvvv = True;
   22213    return delta;
   22214 }
   22215 
   22216 
   22217 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
   22218    for the operation, no inversion of the left arg, and no swapping of
   22219    args. */
   22220 static
   22221 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
   22222         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22223         Prefix pfx, Long delta, const HChar* name,
   22224         IROp op
   22225      )
   22226 {
   22227    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22228              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   22229 }
   22230 
   22231 
   22232 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
   22233    generator to compute the result, no inversion of the left
   22234    arg, and no swapping of args. */
   22235 static
   22236 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
   22237         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22238         Prefix pfx, Long delta, const HChar* name,
   22239         IRTemp(*opFn)(IRTemp,IRTemp)
   22240      )
   22241 {
   22242    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22243              uses_vvvv, vbi, pfx, delta, name,
   22244              Iop_INVALID, opFn, False, False );
   22245 }
   22246 
   22247 
   22248 /* Vector by scalar shift of V by the amount specified at the bottom
   22249    of E. */
   22250 static ULong dis_AVX128_shiftV_byE ( const VexAbiInfo* vbi,
   22251                                      Prefix pfx, Long delta,
   22252                                      const HChar* opname, IROp op )
   22253 {
   22254    HChar   dis_buf[50];
   22255    Int     alen, size;
   22256    IRTemp  addr;
   22257    Bool    shl, shr, sar;
   22258    UChar   modrm = getUChar(delta);
   22259    UInt    rG    = gregOfRexRM(pfx,modrm);
   22260    UInt    rV    = getVexNvvvv(pfx);;
   22261    IRTemp  g0    = newTemp(Ity_V128);
   22262    IRTemp  g1    = newTemp(Ity_V128);
   22263    IRTemp  amt   = newTemp(Ity_I64);
   22264    IRTemp  amt8  = newTemp(Ity_I8);
   22265    if (epartIsReg(modrm)) {
   22266       UInt rE = eregOfRexRM(pfx,modrm);
   22267       assign( amt, getXMMRegLane64(rE, 0) );
   22268       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22269           nameXMMReg(rV), nameXMMReg(rG) );
   22270       delta++;
   22271    } else {
   22272       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22273       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22274       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   22275       delta += alen;
   22276    }
   22277    assign( g0, getXMMReg(rV) );
   22278    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22279 
   22280    shl = shr = sar = False;
   22281    size = 0;
   22282    switch (op) {
   22283       case Iop_ShlN16x8: shl = True; size = 32; break;
   22284       case Iop_ShlN32x4: shl = True; size = 32; break;
   22285       case Iop_ShlN64x2: shl = True; size = 64; break;
   22286       case Iop_SarN16x8: sar = True; size = 16; break;
   22287       case Iop_SarN32x4: sar = True; size = 32; break;
   22288       case Iop_ShrN16x8: shr = True; size = 16; break;
   22289       case Iop_ShrN32x4: shr = True; size = 32; break;
   22290       case Iop_ShrN64x2: shr = True; size = 64; break;
   22291       default: vassert(0);
   22292    }
   22293 
   22294    if (shl || shr) {
   22295      assign(
   22296         g1,
   22297         IRExpr_ITE(
   22298            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22299            binop(op, mkexpr(g0), mkexpr(amt8)),
   22300            mkV128(0x0000)
   22301         )
   22302      );
   22303    } else
   22304    if (sar) {
   22305      assign(
   22306         g1,
   22307         IRExpr_ITE(
   22308            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22309            binop(op, mkexpr(g0), mkexpr(amt8)),
   22310            binop(op, mkexpr(g0), mkU8(size-1))
   22311         )
   22312      );
   22313    } else {
   22314       vassert(0);
   22315    }
   22316 
   22317    putYMMRegLoAndZU( rG, mkexpr(g1) );
   22318    return delta;
   22319 }
   22320 
   22321 
   22322 /* Vector by scalar shift of V by the amount specified at the bottom
   22323    of E. */
   22324 static ULong dis_AVX256_shiftV_byE ( const VexAbiInfo* vbi,
   22325                                      Prefix pfx, Long delta,
   22326                                      const HChar* opname, IROp op )
   22327 {
   22328    HChar   dis_buf[50];
   22329    Int     alen, size;
   22330    IRTemp  addr;
   22331    Bool    shl, shr, sar;
   22332    UChar   modrm = getUChar(delta);
   22333    UInt    rG    = gregOfRexRM(pfx,modrm);
   22334    UInt    rV    = getVexNvvvv(pfx);;
   22335    IRTemp  g0    = newTemp(Ity_V256);
   22336    IRTemp  g1    = newTemp(Ity_V256);
   22337    IRTemp  amt   = newTemp(Ity_I64);
   22338    IRTemp  amt8  = newTemp(Ity_I8);
   22339    if (epartIsReg(modrm)) {
   22340       UInt rE = eregOfRexRM(pfx,modrm);
   22341       assign( amt, getXMMRegLane64(rE, 0) );
   22342       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22343           nameYMMReg(rV), nameYMMReg(rG) );
   22344       delta++;
   22345    } else {
   22346       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22347       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22348       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   22349       delta += alen;
   22350    }
   22351    assign( g0, getYMMReg(rV) );
   22352    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22353 
   22354    shl = shr = sar = False;
   22355    size = 0;
   22356    switch (op) {
   22357       case Iop_ShlN16x16: shl = True; size = 32; break;
   22358       case Iop_ShlN32x8:  shl = True; size = 32; break;
   22359       case Iop_ShlN64x4:  shl = True; size = 64; break;
   22360       case Iop_SarN16x16: sar = True; size = 16; break;
   22361       case Iop_SarN32x8:  sar = True; size = 32; break;
   22362       case Iop_ShrN16x16: shr = True; size = 16; break;
   22363       case Iop_ShrN32x8:  shr = True; size = 32; break;
   22364       case Iop_ShrN64x4:  shr = True; size = 64; break;
   22365       default: vassert(0);
   22366    }
   22367 
   22368    if (shl || shr) {
   22369      assign(
   22370         g1,
   22371         IRExpr_ITE(
   22372            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22373            binop(op, mkexpr(g0), mkexpr(amt8)),
   22374            binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   22375         )
   22376      );
   22377    } else
   22378    if (sar) {
   22379      assign(
   22380         g1,
   22381         IRExpr_ITE(
   22382            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22383            binop(op, mkexpr(g0), mkexpr(amt8)),
   22384            binop(op, mkexpr(g0), mkU8(size-1))
   22385         )
   22386      );
   22387    } else {
   22388       vassert(0);
   22389    }
   22390 
   22391    putYMMReg( rG, mkexpr(g1) );
   22392    return delta;
   22393 }
   22394 
   22395 
/* Vector by vector shift of V by the amount specified at the bottom
   of E.  Vector by vector shifts are defined for all shift amounts,
   so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
   anyway).  Instead each lane is broken out and shifted with the
   scalar IR shift ops, with an explicit guard for out-of-range
   amounts: those give zero, except for Iop_Sar32 where they behave
   like a shift by (lane width - 1). */
static ULong dis_AVX_var_shiftV_byE ( const VexAbiInfo* vbi,
                                      Prefix pfx, Long delta,
                                      const HChar* opname, IROp op, Bool isYMM )
{
   HChar   dis_buf[50];
   Int     alen, size, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);;
   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amts[8], sVs[8], res[8];
   /* Fetch the per-lane shift amounts from E: a register or a full
      vector load. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
             nameYMMReg(rV), nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
             nameXMMReg(rV), nameXMMReg(rG) );
      }
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG) );
      }
      delta += alen;
   }
   /* The value to be shifted comes from V. */
   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );

   /* Lane width in bits, taken from the scalar shift op. */
   size = 0;
   switch (op) {
      case Iop_Shl32: size = 32; break;
      case Iop_Shl64: size = 64; break;
      case Iop_Sar32: size = 32; break;
      case Iop_Shr32: size = 32; break;
      case Iop_Shr64: size = 64; break;
      default: vassert(0);
   }

   /* Break the source and the amounts into individual lanes; unused
      entries (e.g. the top lanes in the 128-bit case) stay
      IRTemp_INVALID. */
   for (i = 0; i < 8; i++) {
      sVs[i] = IRTemp_INVALID;
      amts[i] = IRTemp_INVALID;
   }
   switch (size) {
      case 32:
         if (isYMM) {
            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
                                   &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
        }
         break;
      case 64:
         if (isYMM) {
            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to64s( sV, &sVs[1], &sVs[0] );
            breakupV128to64s( amt, &amts[1], &amts[0] );
         }
         break;
      default: vassert(0);
   }
   /* Per lane: if the amount is in range, do the scalar shift;
      otherwise produce 0, or for Iop_Sar32 a shift by size-1 (which
      replicates the sign bit). */
   for (i = 0; i < 8; i++)
      if (sVs[i] != IRTemp_INVALID) {
         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
         assign( res[i],
                 IRExpr_ITE(
                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
                          mkexpr(amts[i]),
                          size == 32 ? mkU32(size) : mkU64(size)),
                    binop(op, mkexpr(sVs[i]),
                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
                                    mkexpr(amts[i]))),
                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
                                    : size == 32 ? mkU32(0) : mkU64(0)
         ));
      }
   /* Write the lanes back to G; for the 128-bit form the upper YMM
      lanes are zeroed. */
   switch (size) {
      case 32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case 64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   return delta;
}
   22506 
   22507 
   22508 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   22509    version of dis_SSE_shiftE_imm. */
   22510 static
   22511 Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
   22512                                  Long delta, const HChar* opname, IROp op )
   22513 {
   22514    Bool    shl, shr, sar;
   22515    UChar   rm   = getUChar(delta);
   22516    IRTemp  e0   = newTemp(Ity_V128);
   22517    IRTemp  e1   = newTemp(Ity_V128);
   22518    UInt    rD   = getVexNvvvv(pfx);
   22519    UChar   amt, size;
   22520    vassert(epartIsReg(rm));
   22521    vassert(gregLO3ofRM(rm) == 2
   22522            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   22523    amt = getUChar(delta+1);
   22524    delta += 2;
   22525    DIP("%s $%d,%s,%s\n", opname,
   22526                          (Int)amt,
   22527                          nameXMMReg(eregOfRexRM(pfx,rm)),
   22528                          nameXMMReg(rD));
   22529    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   22530 
   22531    shl = shr = sar = False;
   22532    size = 0;
   22533    switch (op) {
   22534       case Iop_ShlN16x8: shl = True; size = 16; break;
   22535       case Iop_ShlN32x4: shl = True; size = 32; break;
   22536       case Iop_ShlN64x2: shl = True; size = 64; break;
   22537       case Iop_SarN16x8: sar = True; size = 16; break;
   22538       case Iop_SarN32x4: sar = True; size = 32; break;
   22539       case Iop_ShrN16x8: shr = True; size = 16; break;
   22540       case Iop_ShrN32x4: shr = True; size = 32; break;
   22541       case Iop_ShrN64x2: shr = True; size = 64; break;
   22542       default: vassert(0);
   22543    }
   22544 
   22545    if (shl || shr) {
   22546      assign( e1, amt >= size
   22547                     ? mkV128(0x0000)
   22548                     : binop(op, mkexpr(e0), mkU8(amt))
   22549      );
   22550    } else
   22551    if (sar) {
   22552      assign( e1, amt >= size
   22553                     ? binop(op, mkexpr(e0), mkU8(size-1))
   22554                     : binop(op, mkexpr(e0), mkU8(amt))
   22555      );
   22556    } else {
   22557       vassert(0);
   22558    }
   22559 
   22560    putYMMRegLoAndZU( rD, mkexpr(e1) );
   22561    return delta;
   22562 }
   22563 
   22564 
   22565 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   22566    version of dis_AVX128_shiftE_to_V_imm. */
   22567 static
   22568 Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
   22569                                  Long delta, const HChar* opname, IROp op )
   22570 {
   22571    Bool    shl, shr, sar;
   22572    UChar   rm   = getUChar(delta);
   22573    IRTemp  e0   = newTemp(Ity_V256);
   22574    IRTemp  e1   = newTemp(Ity_V256);
   22575    UInt    rD   = getVexNvvvv(pfx);
   22576    UChar   amt, size;
   22577    vassert(epartIsReg(rm));
   22578    vassert(gregLO3ofRM(rm) == 2
   22579            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   22580    amt = getUChar(delta+1);
   22581    delta += 2;
   22582    DIP("%s $%d,%s,%s\n", opname,
   22583                          (Int)amt,
   22584                          nameYMMReg(eregOfRexRM(pfx,rm)),
   22585                          nameYMMReg(rD));
   22586    assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
   22587 
   22588    shl = shr = sar = False;
   22589    size = 0;
   22590    switch (op) {
   22591       case Iop_ShlN16x16: shl = True; size = 16; break;
   22592       case Iop_ShlN32x8:  shl = True; size = 32; break;
   22593       case Iop_ShlN64x4:  shl = True; size = 64; break;
   22594       case Iop_SarN16x16: sar = True; size = 16; break;
   22595       case Iop_SarN32x8:  sar = True; size = 32; break;
   22596       case Iop_ShrN16x16: shr = True; size = 16; break;
   22597       case Iop_ShrN32x8:  shr = True; size = 32; break;
   22598       case Iop_ShrN64x4:  shr = True; size = 64; break;
   22599       default: vassert(0);
   22600    }
   22601 
   22602 
   22603    if (shl || shr) {
   22604      assign( e1, amt >= size
   22605                     ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   22606                     : binop(op, mkexpr(e0), mkU8(amt))
   22607      );
   22608    } else
   22609    if (sar) {
   22610      assign( e1, amt >= size
   22611                     ? binop(op, mkexpr(e0), mkU8(size-1))
   22612                     : binop(op, mkexpr(e0), mkU8(amt))
   22613      );
   22614    } else {
   22615       vassert(0);
   22616    }
   22617 
   22618    putYMMReg( rD, mkexpr(e1) );
   22619    return delta;
   22620 }
   22621 
   22622 
   22623 /* Lower 64-bit lane only AVX128 binary operation:
   22624    G[63:0]    = V[63:0] `op` E[63:0]
   22625    G[127:64]  = V[127:64]
   22626    G[255:128] = 0.
   22627    The specified op must be of the 64F0x2 kind, so that it
   22628    copies the upper half of the left operand to the result.
   22629 */
   22630 static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
   22631                                        const VexAbiInfo* vbi,
   22632                                        Prefix pfx, Long delta,
   22633                                        const HChar* opname, IROp op )
   22634 {
   22635    HChar   dis_buf[50];
   22636    Int     alen;
   22637    IRTemp  addr;
   22638    UChar   rm    = getUChar(delta);
   22639    UInt    rG    = gregOfRexRM(pfx,rm);
   22640    UInt    rV    = getVexNvvvv(pfx);
   22641    IRExpr* vpart = getXMMReg(rV);
   22642    if (epartIsReg(rm)) {
   22643       UInt rE = eregOfRexRM(pfx,rm);
   22644       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   22645       DIP("%s %s,%s,%s\n", opname,
   22646           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22647       delta = delta+1;
   22648    } else {
   22649       /* We can only do a 64-bit memory read, so the upper half of the
   22650          E operand needs to be made simply of zeroes. */
   22651       IRTemp epart = newTemp(Ity_V128);
   22652       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22653       assign( epart, unop( Iop_64UtoV128,
   22654                            loadLE(Ity_I64, mkexpr(addr))) );
   22655       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   22656       DIP("%s %s,%s,%s\n", opname,
   22657           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22658       delta = delta+alen;
   22659    }
   22660    putYMMRegLane128( rG, 1, mkV128(0) );
   22661    *uses_vvvv = True;
   22662    return delta;
   22663 }
   22664 
   22665 
   22666 /* Lower 64-bit lane only AVX128 unary operation:
   22667    G[63:0]    = op(E[63:0])
   22668    G[127:64]  = V[127:64]
   22669    G[255:128] = 0
   22670    The specified op must be of the 64F0x2 kind, so that it
   22671    copies the upper half of the operand to the result.
   22672 */
   22673 static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
   22674                                              const VexAbiInfo* vbi,
   22675                                              Prefix pfx, Long delta,
   22676                                              const HChar* opname, IROp op )
   22677 {
   22678    HChar   dis_buf[50];
   22679    Int     alen;
   22680    IRTemp  addr;
   22681    UChar   rm  = getUChar(delta);
   22682    UInt    rG  = gregOfRexRM(pfx,rm);
   22683    UInt    rV  = getVexNvvvv(pfx);
   22684    IRTemp  e64 = newTemp(Ity_I64);
   22685 
   22686    /* Fetch E[63:0] */
   22687    if (epartIsReg(rm)) {
   22688       UInt rE = eregOfRexRM(pfx,rm);
   22689       assign(e64, getXMMRegLane64(rE, 0));
   22690       DIP("%s %s,%s,%s\n", opname,
   22691           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22692       delta += 1;
   22693    } else {
   22694       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22695       assign(e64, loadLE(Ity_I64, mkexpr(addr)));
   22696       DIP("%s %s,%s,%s\n", opname,
   22697           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22698       delta += alen;
   22699    }
   22700 
   22701    /* Create a value 'arg' as V[127:64]++E[63:0] */
   22702    IRTemp arg = newTemp(Ity_V128);
   22703    assign(arg,
   22704           binop(Iop_SetV128lo64,
   22705                 getXMMReg(rV), mkexpr(e64)));
   22706    /* and apply op to it */
   22707    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   22708    *uses_vvvv = True;
   22709    return delta;
   22710 }
   22711 
   22712 
   22713 /* Lower 32-bit lane only AVX128 unary operation:
   22714    G[31:0]    = op(E[31:0])
   22715    G[127:32]  = V[127:32]
   22716    G[255:128] = 0
   22717    The specified op must be of the 32F0x4 kind, so that it
   22718    copies the upper 3/4 of the operand to the result.
   22719 */
   22720 static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
   22721                                              const VexAbiInfo* vbi,
   22722                                              Prefix pfx, Long delta,
   22723                                              const HChar* opname, IROp op )
   22724 {
   22725    HChar   dis_buf[50];
   22726    Int     alen;
   22727    IRTemp  addr;
   22728    UChar   rm  = getUChar(delta);
   22729    UInt    rG  = gregOfRexRM(pfx,rm);
   22730    UInt    rV  = getVexNvvvv(pfx);
   22731    IRTemp  e32 = newTemp(Ity_I32);
   22732 
   22733    /* Fetch E[31:0] */
   22734    if (epartIsReg(rm)) {
   22735       UInt rE = eregOfRexRM(pfx,rm);
   22736       assign(e32, getXMMRegLane32(rE, 0));
   22737       DIP("%s %s,%s,%s\n", opname,
   22738           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22739       delta += 1;
   22740    } else {
   22741       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22742       assign(e32, loadLE(Ity_I32, mkexpr(addr)));
   22743       DIP("%s %s,%s,%s\n", opname,
   22744           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22745       delta += alen;
   22746    }
   22747 
   22748    /* Create a value 'arg' as V[127:32]++E[31:0] */
   22749    IRTemp arg = newTemp(Ity_V128);
   22750    assign(arg,
   22751           binop(Iop_SetV128lo32,
   22752                 getXMMReg(rV), mkexpr(e32)));
   22753    /* and apply op to it */
   22754    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   22755    *uses_vvvv = True;
   22756    return delta;
   22757 }
   22758 
   22759 
   22760 /* Lower 32-bit lane only AVX128 binary operation:
   22761    G[31:0]    = V[31:0] `op` E[31:0]
   22762    G[127:32]  = V[127:32]
   22763    G[255:128] = 0.
   22764    The specified op must be of the 32F0x4 kind, so that it
   22765    copies the upper 3/4 of the left operand to the result.
   22766 */
   22767 static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
   22768                                        const VexAbiInfo* vbi,
   22769                                        Prefix pfx, Long delta,
   22770                                        const HChar* opname, IROp op )
   22771 {
   22772    HChar   dis_buf[50];
   22773    Int     alen;
   22774    IRTemp  addr;
   22775    UChar   rm    = getUChar(delta);
   22776    UInt    rG    = gregOfRexRM(pfx,rm);
   22777    UInt    rV    = getVexNvvvv(pfx);
   22778    IRExpr* vpart = getXMMReg(rV);
   22779    if (epartIsReg(rm)) {
   22780       UInt rE = eregOfRexRM(pfx,rm);
   22781       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   22782       DIP("%s %s,%s,%s\n", opname,
   22783           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22784       delta = delta+1;
   22785    } else {
   22786       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   22787          E operand needs to be made simply of zeroes. */
   22788       IRTemp epart = newTemp(Ity_V128);
   22789       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22790       assign( epart, unop( Iop_32UtoV128,
   22791                            loadLE(Ity_I32, mkexpr(addr))) );
   22792       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   22793       DIP("%s %s,%s,%s\n", opname,
   22794           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22795       delta = delta+alen;
   22796    }
   22797    putYMMRegLane128( rG, 1, mkV128(0) );
   22798    *uses_vvvv = True;
   22799    return delta;
   22800 }
   22801 
   22802 
   22803 /* All-lanes AVX128 binary operation:
   22804    G[127:0]   = V[127:0] `op` E[127:0]
   22805    G[255:128] = 0.
   22806 */
   22807 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   22808                                   const VexAbiInfo* vbi,
   22809                                   Prefix pfx, Long delta,
   22810                                   const HChar* opname, IROp op )
   22811 {
   22812    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22813              uses_vvvv, vbi, pfx, delta, opname, op,
   22814              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   22815    );
   22816 }
   22817 
   22818 
   22819 /* Handles AVX128 32F/64F comparisons.  A derivative of
   22820    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   22821    original delta to indicate failure. */
static
Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Bool all_lanes, Int sz )
{
   /* sz is the lane size in bytes: 4 (32F) or 8 (64F).  all_lanes
      selects a full-vector comparison; otherwise only the lowest
      lane is compared and the other lanes are copied from the first
      source operand (V). */
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V128);
   IRTemp argR     = newTemp(Ity_V128);

   /* argL is V (the vvvv-encoded register); argR is E (reg or mem). */
   assign(argL, getXMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* imm8 selects the comparison predicate.  findSSECmpOp maps it
         to an IROp, possibly also requiring the args to be swapped
         beforehand (preSwap) and/or the result inverted afterwards
         (postNot). */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getXMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      /* In the one-lane case, only the low 32 or 64 bits of memory
         are accessed, per the architected behaviour. */
      assign(argR,
             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }

   /* 'plain' is the comparison result before any post-inversion. */
   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
                         : binop(op, mkexpr(argL), mkexpr(argR)));

   if (all_lanes) {
      /* This is simple: just invert the result, if necessary, and
         have done. */
      if (postNot) {
         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else
   if (!preSwap) {
      /* More complex.  It's a one-lane-only, hence need to possibly
         invert only that one lane.  But at least the other lanes are
         correctly "in" the result, having been copied from the left
         operand (argL). */
      if (postNot) {
         /* mkV128's mask has one bit per byte, so 0x000F selects the
            low 4 bytes (32F lane) and 0x00FF the low 8 (64F lane). */
         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
                                                  mask) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else {
      /* This is the most complex case.  One-lane-only, but the args
         were swapped.  So we have to possibly invert the bottom lane,
         and (definitely) we have to copy the upper lane(s) from argL
         since, due to the swapping, what's currently there is from
         argR, which is not correct. */
      IRTemp res     = newTemp(Ity_V128);
      IRTemp mask    = newTemp(Ity_V128);
      IRTemp notMask = newTemp(Ity_V128);
      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
      if (postNot) {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            unop(Iop_NotV128, mkexpr(plain)),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      } else {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            mkexpr(plain),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      }
      putYMMRegLoAndZU( rG, mkexpr(res) );
   }

   *uses_vvvv = True;
   return delta;
}
   22927 
   22928 
   22929 /* Handles AVX256 32F/64F comparisons.  A derivative of
   22930    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   22931    original delta to indicate failure. */
static
Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Int sz )
{
   /* sz is the lane size in bytes: 4 (32F) or 8 (64F).  Unlike the
      128-bit variant, this is always an all-lanes comparison. */
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V256);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V256);
   IRTemp argR     = newTemp(Ity_V256);
   IRTemp argLhi   = IRTemp_INVALID;
   IRTemp argLlo   = IRTemp_INVALID;
   IRTemp argRhi   = IRTemp_INVALID;
   IRTemp argRlo   = IRTemp_INVALID;

   /* argL is V (the vvvv-encoded register); argR is E (reg or mem). */
   assign(argL, getYMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* imm8 selects the comparison predicate.  findSSECmpOp maps it
         to an IROp, possibly also requiring the args to be swapped
         beforehand (preSwap) and/or the result inverted afterwards
         (postNot). */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getYMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }

   /* The comparison is done on the two 128-bit halves separately.
      Any required pre-swap is handled here simply by exchanging
      which source feeds which side of the op. */
   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
   assign(plain, binop( Iop_V128HLtoV256,
                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );

   /* This is simple: just invert the result, if necessary, and
      have done. */
   if (postNot) {
      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
   } else {
      putYMMReg( rG, mkexpr(plain) );
   }

   *uses_vvvv = True;
   return delta;
}
   22999 
   23000 
   23001 /* Handles AVX128 unary E-to-G all-lanes operations. */
   23002 static
   23003 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23004                                const VexAbiInfo* vbi,
   23005                                Prefix pfx, Long delta,
   23006                                const HChar* opname,
   23007                                IRTemp (*opFn)(IRTemp) )
   23008 {
   23009    HChar  dis_buf[50];
   23010    Int    alen;
   23011    IRTemp addr;
   23012    IRTemp res  = newTemp(Ity_V128);
   23013    IRTemp arg  = newTemp(Ity_V128);
   23014    UChar  rm   = getUChar(delta);
   23015    UInt   rG   = gregOfRexRM(pfx, rm);
   23016    if (epartIsReg(rm)) {
   23017       UInt rE = eregOfRexRM(pfx,rm);
   23018       assign(arg, getXMMReg(rE));
   23019       delta += 1;
   23020       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23021    } else {
   23022       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23023       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23024       delta += alen;
   23025       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23026    }
   23027    res = opFn(arg);
   23028    putYMMRegLoAndZU( rG, mkexpr(res) );
   23029    *uses_vvvv = False;
   23030    return delta;
   23031 }
   23032 
   23033 
   23034 /* Handles AVX128 unary E-to-G all-lanes operations. */
   23035 static
   23036 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23037                                    const VexAbiInfo* vbi,
   23038                                    Prefix pfx, Long delta,
   23039                                    const HChar* opname, IROp op )
   23040 {
   23041    HChar  dis_buf[50];
   23042    Int    alen;
   23043    IRTemp addr;
   23044    IRTemp arg  = newTemp(Ity_V128);
   23045    UChar  rm   = getUChar(delta);
   23046    UInt   rG   = gregOfRexRM(pfx, rm);
   23047    if (epartIsReg(rm)) {
   23048       UInt rE = eregOfRexRM(pfx,rm);
   23049       assign(arg, getXMMReg(rE));
   23050       delta += 1;
   23051       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23052    } else {
   23053       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23054       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23055       delta += alen;
   23056       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23057    }
   23058    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   23059    // up in the usual way.
   23060    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   23061    /* XXXROUNDINGFIXME */
   23062    IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), mkexpr(arg))
   23063                            : unop(op, mkexpr(arg));
   23064    putYMMRegLoAndZU( rG, res );
   23065    *uses_vvvv = False;
   23066    return delta;
   23067 }
   23068 
   23069 
   23070 /* FIXME: common up with the _128_ version above? */
static
Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   /* Handles a VEX NDS (3-operand) 256-bit insn: D = SL `op` SR,
      where SL is the vvvv-encoded register and SR is the E operand
      (ymm reg or m256). */
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);
   UInt   rSL   = getVexNvvvv(pfx);
   IRTemp tSL   = newTemp(Ity_V256);
   IRTemp tSR   = newTemp(Ity_V256);
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);

   /* Optionally complement the left source before use (e.g. for
      ANDN-style operations). */
   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
                             : getYMMReg(rSL));

   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getYMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
   }

   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      vassert(opFn == NULL);
      res = newTemp(Ity_V256);
      if (requiresRMode(op)) {
         /* FP ops need a rounding mode; a fixed one is faked up
            rather than consulting the guest state. */
         IRTemp rm = newTemp(Ity_I32);
         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
         assign(res, swapArgs
                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
      } else {
         assign(res, swapArgs
                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
      }
   } else {
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   putYMMReg(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   23134 
   23135 
   23136 /* All-lanes AVX256 binary operation:
   23137    G[255:0] = V[255:0] `op` E[255:0]
   23138 */
   23139 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   23140                                   const VexAbiInfo* vbi,
   23141                                   Prefix pfx, Long delta,
   23142                                   const HChar* opname, IROp op )
   23143 {
   23144    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23145              uses_vvvv, vbi, pfx, delta, opname, op,
   23146              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   23147    );
   23148 }
   23149 
   23150 
   23151 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
   23152    for the operation, no inversion of the left arg, and no swapping of
   23153    args. */
   23154 static
   23155 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
   23156         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23157         Prefix pfx, Long delta, const HChar* name,
   23158         IROp op
   23159      )
   23160 {
   23161    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23162              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   23163 }
   23164 
   23165 
   23166 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
   23167    generator to compute the result, no inversion of the left
   23168    arg, and no swapping of args. */
   23169 static
   23170 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
   23171         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23172         Prefix pfx, Long delta, const HChar* name,
   23173         IRTemp(*opFn)(IRTemp,IRTemp)
   23174      )
   23175 {
   23176    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23177              uses_vvvv, vbi, pfx, delta, name,
   23178              Iop_INVALID, opFn, False, False );
   23179 }
   23180 
   23181 
   23182 /* Handles AVX256 unary E-to-G all-lanes operations. */
   23183 static
   23184 Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23185                                const VexAbiInfo* vbi,
   23186                                Prefix pfx, Long delta,
   23187                                const HChar* opname,
   23188                                IRTemp (*opFn)(IRTemp) )
   23189 {
   23190    HChar  dis_buf[50];
   23191    Int    alen;
   23192    IRTemp addr;
   23193    IRTemp res  = newTemp(Ity_V256);
   23194    IRTemp arg  = newTemp(Ity_V256);
   23195    UChar  rm   = getUChar(delta);
   23196    UInt   rG   = gregOfRexRM(pfx, rm);
   23197    if (epartIsReg(rm)) {
   23198       UInt rE = eregOfRexRM(pfx,rm);
   23199       assign(arg, getYMMReg(rE));
   23200       delta += 1;
   23201       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23202    } else {
   23203       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23204       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23205       delta += alen;
   23206       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23207    }
   23208    res = opFn(arg);
   23209    putYMMReg( rG, mkexpr(res) );
   23210    *uses_vvvv = False;
   23211    return delta;
   23212 }
   23213 
   23214 
   23215 /* Handles AVX256 unary E-to-G all-lanes operations. */
   23216 static
   23217 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23218                                    const VexAbiInfo* vbi,
   23219                                    Prefix pfx, Long delta,
   23220                                    const HChar* opname, IROp op )
   23221 {
   23222    HChar  dis_buf[50];
   23223    Int    alen;
   23224    IRTemp addr;
   23225    IRTemp arg  = newTemp(Ity_V256);
   23226    UChar  rm   = getUChar(delta);
   23227    UInt   rG   = gregOfRexRM(pfx, rm);
   23228    if (epartIsReg(rm)) {
   23229       UInt rE = eregOfRexRM(pfx,rm);
   23230       assign(arg, getYMMReg(rE));
   23231       delta += 1;
   23232       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23233    } else {
   23234       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23235       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23236       delta += alen;
   23237       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23238    }
   23239    putYMMReg( rG, unop(op, mkexpr(arg)) );
   23240    *uses_vvvv = False;
   23241    return delta;
   23242 }
   23243 
   23244 
   23245 /* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
   23246    had a variant of Iop_64x4toV256 that took F64s as args instead. */
   23247 static Long dis_CVTDQ2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   23248                                Long delta )
   23249 {
   23250    IRTemp addr  = IRTemp_INVALID;
   23251    Int    alen  = 0;
   23252    HChar  dis_buf[50];
   23253    UChar  modrm = getUChar(delta);
   23254    IRTemp sV    = newTemp(Ity_V128);
   23255    UInt   rG    = gregOfRexRM(pfx,modrm);
   23256    if (epartIsReg(modrm)) {
   23257       UInt rE = eregOfRexRM(pfx,modrm);
   23258       assign( sV, getXMMReg(rE) );
   23259       delta += 1;
   23260       DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   23261    } else {
   23262       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23263       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   23264       delta += alen;
   23265       DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
   23266    }
   23267    IRTemp s3, s2, s1, s0;
   23268    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   23269    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   23270    IRExpr* res
   23271       = IRExpr_Qop(
   23272            Iop_64x4toV256,
   23273            unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
   23274            unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
   23275            unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
   23276            unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
   23277         );
   23278    putYMMReg(rG, res);
   23279    return delta;
   23280 }
   23281 
   23282 
static Long dis_CVTPD2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   /* VCVTPD2PS with a 256-bit source: narrow four F64 lanes to four
      F32 lanes, using the guest's current SSE rounding mode.  The
      result fills the low 128 bits of the destination; the upper
      128 bits are zeroed. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   /* Each 64-bit chunk holds an F64 bit-pattern; reinterpret it as
      F64 and narrow to F32 under 'rmode'. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}
   23319 
   23320 
   23321 static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op )
   23322 {
   23323    IRTemp tLhi, tLlo, tRhi, tRlo;
   23324    tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
   23325    IRTemp res = newTemp(Ity_V256);
   23326    breakupV256toV128s( tL, &tLhi, &tLlo );
   23327    breakupV256toV128s( tR, &tRhi, &tRlo );
   23328    assign( res, binop( Iop_V128HLtoV256,
   23329                        binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
   23330                        binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
   23331    return res;
   23332 }
   23333 
   23334 
   23335 static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
   23336 {
   23337    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
   23338 }
   23339 
   23340 
   23341 static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
   23342 {
   23343    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
   23344 }
   23345 
   23346 
   23347 static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
   23348 {
   23349    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
   23350 }
   23351 
   23352 
   23353 static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
   23354 {
   23355    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
   23356 }
   23357 
   23358 
   23359 static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
   23360 {
   23361    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
   23362 }
   23363 
   23364 
   23365 static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
   23366 {
   23367    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
   23368 }
   23369 
   23370 
   23371 static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
   23372 {
   23373    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
   23374 }
   23375 
   23376 
   23377 static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
   23378 {
   23379    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
   23380 }
   23381 
   23382 
   23383 static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
   23384 {
   23385    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
   23386 }
   23387 
   23388 
   23389 static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
   23390 {
   23391    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
   23392 }
   23393 
   23394 
   23395 static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
   23396 {
   23397    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
   23398 }
   23399 
   23400 
   23401 static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
   23402 {
   23403    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
   23404 }
   23405 
   23406 
   23407 __attribute__((noinline))
   23408 static
   23409 Long dis_ESC_0F__VEX (
   23410         /*MB_OUT*/DisResult* dres,
   23411         /*OUT*/   Bool*      uses_vvvv,
   23412         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   23413         Bool         resteerCisOk,
   23414         void*        callback_opaque,
   23415         const VexArchInfo* archinfo,
   23416         const VexAbiInfo*  vbi,
   23417         Prefix pfx, Int sz, Long deltaIN
   23418      )
   23419 {
   23420    IRTemp addr  = IRTemp_INVALID;
   23421    Int    alen  = 0;
   23422    HChar  dis_buf[50];
   23423    Long   delta = deltaIN;
   23424    UChar  opc   = getUChar(delta);
   23425    delta++;
   23426    *uses_vvvv = False;
   23427 
   23428    switch (opc) {
   23429 
   23430    case 0x10:
   23431       /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23432       /* Move 64 bits from E (mem only) to G (lo half xmm).
   23433          Bits 255-64 of the dest are zeroed out. */
   23434       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   23435          UChar modrm = getUChar(delta);
   23436          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23437          UInt   rG   = gregOfRexRM(pfx,modrm);
   23438          IRTemp z128 = newTemp(Ity_V128);
   23439          assign(z128, mkV128(0));
   23440          putXMMReg( rG, mkexpr(z128) );
   23441          /* FIXME: ALIGNMENT CHECK? */
   23442          putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   23443          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23444          DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
   23445          delta += alen;
   23446          goto decode_success;
   23447       }
   23448       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23449       /* Reg form. */
   23450       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   23451          UChar modrm = getUChar(delta);
   23452          UInt  rG    = gregOfRexRM(pfx, modrm);
   23453          UInt  rE    = eregOfRexRM(pfx, modrm);
   23454          UInt  rV    = getVexNvvvv(pfx);
   23455          delta++;
   23456          DIP("vmovsd %s,%s,%s\n",
   23457              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23458          IRTemp res = newTemp(Ity_V128);
   23459          assign(res, binop(Iop_64HLtoV128,
   23460                            getXMMRegLane64(rV, 1),
   23461                            getXMMRegLane64(rE, 0)));
   23462          putYMMRegLoAndZU(rG, mkexpr(res));
   23463          *uses_vvvv = True;
   23464          goto decode_success;
   23465       }
   23466       /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23467       /* Move 32 bits from E (mem only) to G (lo half xmm).
   23468          Bits 255-32 of the dest are zeroed out. */
   23469       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   23470          UChar modrm = getUChar(delta);
   23471          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23472          UInt   rG   = gregOfRexRM(pfx,modrm);
   23473          IRTemp z128 = newTemp(Ity_V128);
   23474          assign(z128, mkV128(0));
   23475          putXMMReg( rG, mkexpr(z128) );
   23476          /* FIXME: ALIGNMENT CHECK? */
   23477          putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
   23478          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23479          DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
   23480          delta += alen;
   23481          goto decode_success;
   23482       }
   23483       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23484       /* Reg form. */
   23485       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   23486          UChar modrm = getUChar(delta);
   23487          UInt  rG    = gregOfRexRM(pfx, modrm);
   23488          UInt  rE    = eregOfRexRM(pfx, modrm);
   23489          UInt  rV    = getVexNvvvv(pfx);
   23490          delta++;
   23491          DIP("vmovss %s,%s,%s\n",
   23492              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23493          IRTemp res = newTemp(Ity_V128);
   23494          assign( res, binop( Iop_64HLtoV128,
   23495                              getXMMRegLane64(rV, 1),
   23496                              binop(Iop_32HLto64,
   23497                                    getXMMRegLane32(rV, 1),
   23498                                    getXMMRegLane32(rE, 0)) ) );
   23499          putYMMRegLoAndZU(rG, mkexpr(res));
   23500          *uses_vvvv = True;
   23501          goto decode_success;
   23502       }
   23503       /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
   23504       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23505          UChar modrm = getUChar(delta);
   23506          UInt  rG    = gregOfRexRM(pfx, modrm);
   23507          if (epartIsReg(modrm)) {
   23508             UInt rE = eregOfRexRM(pfx,modrm);
   23509             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23510             DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23511             delta += 1;
   23512          } else {
   23513             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23514             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23515             DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
   23516             delta += alen;
   23517          }
   23518          goto decode_success;
   23519       }
   23520       /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
   23521       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23522          UChar modrm = getUChar(delta);
   23523          UInt  rG    = gregOfRexRM(pfx, modrm);
   23524          if (epartIsReg(modrm)) {
   23525             UInt rE = eregOfRexRM(pfx,modrm);
   23526             putYMMReg( rG, getYMMReg( rE ));
   23527             DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   23528             delta += 1;
   23529          } else {
   23530             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23531             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   23532             DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
   23533             delta += alen;
   23534          }
   23535          goto decode_success;
   23536       }
   23537       /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
   23538       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23539          UChar modrm = getUChar(delta);
   23540          UInt  rG    = gregOfRexRM(pfx, modrm);
   23541          if (epartIsReg(modrm)) {
   23542             UInt rE = eregOfRexRM(pfx,modrm);
   23543             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23544             DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23545             delta += 1;
   23546          } else {
   23547             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23548             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23549             DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
   23550             delta += alen;
   23551          }
   23552          goto decode_success;
   23553       }
   23554       /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
   23555       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23556          UChar modrm = getUChar(delta);
   23557          UInt  rG    = gregOfRexRM(pfx, modrm);
   23558          if (epartIsReg(modrm)) {
   23559             UInt rE = eregOfRexRM(pfx,modrm);
   23560             putYMMReg( rG, getYMMReg( rE ));
   23561             DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   23562             delta += 1;
   23563          } else {
   23564             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23565             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   23566             DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
   23567             delta += alen;
   23568          }
   23569          goto decode_success;
   23570       }
   23571       break;
   23572 
   23573    case 0x11:
   23574       /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
   23575       /* Move 64 bits from G (low half xmm) to mem only. */
   23576       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   23577          UChar modrm = getUChar(delta);
   23578          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23579          UInt   rG   = gregOfRexRM(pfx,modrm);
   23580          /* FIXME: ALIGNMENT CHECK? */
   23581          storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
   23582          DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
   23583          delta += alen;
   23584          goto decode_success;
   23585       }
   23586       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
   23587       /* Reg form. */
   23588       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   23589          UChar modrm = getUChar(delta);
   23590          UInt  rG    = gregOfRexRM(pfx, modrm);
   23591          UInt  rE    = eregOfRexRM(pfx, modrm);
   23592          UInt  rV    = getVexNvvvv(pfx);
   23593          delta++;
   23594          DIP("vmovsd %s,%s,%s\n",
   23595              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23596          IRTemp res = newTemp(Ity_V128);
   23597          assign(res, binop(Iop_64HLtoV128,
   23598                            getXMMRegLane64(rV, 1),
   23599                            getXMMRegLane64(rE, 0)));
   23600          putYMMRegLoAndZU(rG, mkexpr(res));
   23601          *uses_vvvv = True;
   23602          goto decode_success;
   23603       }
   23604       /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
   23605       /* Move 32 bits from G (low 1/4 xmm) to mem only. */
   23606       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   23607          UChar modrm = getUChar(delta);
   23608          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23609          UInt   rG   = gregOfRexRM(pfx,modrm);
   23610          /* FIXME: ALIGNMENT CHECK? */
   23611          storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
   23612          DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
   23613          delta += alen;
   23614          goto decode_success;
   23615       }
   23616       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
   23617       /* Reg form. */
   23618       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   23619          UChar modrm = getUChar(delta);
   23620          UInt  rG    = gregOfRexRM(pfx, modrm);
   23621          UInt  rE    = eregOfRexRM(pfx, modrm);
   23622          UInt  rV    = getVexNvvvv(pfx);
   23623          delta++;
   23624          DIP("vmovss %s,%s,%s\n",
   23625              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23626          IRTemp res = newTemp(Ity_V128);
   23627          assign( res, binop( Iop_64HLtoV128,
   23628                              getXMMRegLane64(rV, 1),
   23629                              binop(Iop_32HLto64,
   23630                                    getXMMRegLane32(rV, 1),
   23631                                    getXMMRegLane32(rE, 0)) ) );
   23632          putYMMRegLoAndZU(rG, mkexpr(res));
   23633          *uses_vvvv = True;
   23634          goto decode_success;
   23635       }
   23636       /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
   23637       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23638          UChar modrm = getUChar(delta);
   23639          UInt  rG    = gregOfRexRM(pfx,modrm);
   23640          if (epartIsReg(modrm)) {
   23641             UInt rE = eregOfRexRM(pfx,modrm);
   23642             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   23643             DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   23644             delta += 1;
   23645          } else {
   23646             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23647             storeLE( mkexpr(addr), getXMMReg(rG) );
   23648             DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
   23649             delta += alen;
   23650          }
   23651          goto decode_success;
   23652       }
   23653       /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
   23654       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23655          UChar modrm = getUChar(delta);
   23656          UInt  rG    = gregOfRexRM(pfx,modrm);
   23657          if (epartIsReg(modrm)) {
   23658             UInt rE = eregOfRexRM(pfx,modrm);
   23659             putYMMReg( rE, getYMMReg(rG) );
   23660             DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   23661             delta += 1;
   23662          } else {
   23663             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23664             storeLE( mkexpr(addr), getYMMReg(rG) );
   23665             DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
   23666             delta += alen;
   23667          }
   23668          goto decode_success;
   23669       }
   23670       /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
   23671       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23672          UChar modrm = getUChar(delta);
   23673          UInt  rG    = gregOfRexRM(pfx,modrm);
   23674          if (epartIsReg(modrm)) {
   23675             UInt rE = eregOfRexRM(pfx,modrm);
   23676             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   23677             DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   23678             delta += 1;
   23679          } else {
   23680             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23681             storeLE( mkexpr(addr), getXMMReg(rG) );
   23682             DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
   23683             delta += alen;
   23684          }
   23685          goto decode_success;
   23686       }
   23687       /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
   23688       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23689          UChar modrm = getUChar(delta);
   23690          UInt  rG    = gregOfRexRM(pfx,modrm);
   23691          if (epartIsReg(modrm)) {
   23692             UInt rE = eregOfRexRM(pfx,modrm);
   23693             putYMMReg( rE, getYMMReg(rG) );
   23694             DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   23695             delta += 1;
   23696          } else {
   23697             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23698             storeLE( mkexpr(addr), getYMMReg(rG) );
   23699             DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
   23700             delta += alen;
   23701          }
   23702          goto decode_success;
   23703       }
   23704       break;
   23705 
   23706    case 0x12:
   23707       /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG 12 /r */
   23708       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23709          delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
   23710          goto decode_success;
   23711       }
   23712       /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 12 /r */
   23713       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23714          delta = dis_MOVDDUP_256( vbi, pfx, delta );
   23715          goto decode_success;
   23716       }
   23717       /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
   23718       /* Insn only exists in reg form */
   23719       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   23720           && epartIsReg(getUChar(delta))) {
   23721          UChar modrm = getUChar(delta);
   23722          UInt  rG    = gregOfRexRM(pfx, modrm);
   23723          UInt  rE    = eregOfRexRM(pfx, modrm);
   23724          UInt  rV    = getVexNvvvv(pfx);
   23725          delta++;
   23726          DIP("vmovhlps %s,%s,%s\n",
   23727              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23728          IRTemp res = newTemp(Ity_V128);
   23729          assign(res, binop(Iop_64HLtoV128,
   23730                            getXMMRegLane64(rV, 1),
   23731                            getXMMRegLane64(rE, 1)));
   23732          putYMMRegLoAndZU(rG, mkexpr(res));
   23733          *uses_vvvv = True;
   23734          goto decode_success;
   23735       }
   23736       /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
   23737       /* Insn exists only in mem form, it appears. */
   23738       /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
   23739       /* Insn exists only in mem form, it appears. */
   23740       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   23741           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   23742          UChar modrm = getUChar(delta);
   23743          UInt  rG    = gregOfRexRM(pfx, modrm);
   23744          UInt  rV    = getVexNvvvv(pfx);
   23745          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23746          delta += alen;
   23747          DIP("vmovlpd %s,%s,%s\n",
   23748              dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23749          IRTemp res = newTemp(Ity_V128);
   23750          assign(res, binop(Iop_64HLtoV128,
   23751                            getXMMRegLane64(rV, 1),
   23752                            loadLE(Ity_I64, mkexpr(addr))));
   23753          putYMMRegLoAndZU(rG, mkexpr(res));
   23754          *uses_vvvv = True;
   23755          goto decode_success;
   23756       }
   23757       /* VMOVSLDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 12 /r */
   23758       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   23759          delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
   23760                                    True/*isL*/ );
   23761          goto decode_success;
   23762       }
   23763       /* VMOVSLDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 12 /r */
   23764       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   23765          delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
   23766          goto decode_success;
   23767       }
   23768       break;
   23769 
   23770    case 0x13:
   23771       /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
   23772       /* Insn exists only in mem form, it appears. */
   23773       /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
   23774       /* Insn exists only in mem form, it appears. */
   23775       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   23776           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   23777          UChar modrm = getUChar(delta);
   23778          UInt  rG    = gregOfRexRM(pfx, modrm);
   23779          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23780          delta += alen;
   23781          storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
   23782          DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
   23783          goto decode_success;
   23784       }
   23785       break;
   23786 
   23787    case 0x14:
   23788    case 0x15:
   23789       /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
   23790       /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
   23791       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23792          Bool   hi    = opc == 0x15;
   23793          UChar  modrm = getUChar(delta);
   23794          UInt   rG    = gregOfRexRM(pfx,modrm);
   23795          UInt   rV    = getVexNvvvv(pfx);
   23796          IRTemp eV    = newTemp(Ity_V128);
   23797          IRTemp vV    = newTemp(Ity_V128);
   23798          assign( vV, getXMMReg(rV) );
   23799          if (epartIsReg(modrm)) {
   23800             UInt rE = eregOfRexRM(pfx,modrm);
   23801             assign( eV, getXMMReg(rE) );
   23802             delta += 1;
   23803             DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
   23804                 nameXMMReg(rE), nameXMMReg(rG));
   23805          } else {
   23806             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23807             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   23808             delta += alen;
   23809             DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
   23810                 dis_buf, nameXMMReg(rG));
   23811          }
   23812          IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
   23813          putYMMRegLoAndZU( rG, mkexpr(res) );
   23814          *uses_vvvv = True;
   23815          goto decode_success;
   23816       }
   23817       /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
   23818       /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
   23819       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23820          Bool   hi    = opc == 0x15;
   23821          UChar  modrm = getUChar(delta);
   23822          UInt   rG    = gregOfRexRM(pfx,modrm);
   23823          UInt   rV    = getVexNvvvv(pfx);
   23824          IRTemp eV    = newTemp(Ity_V256);
   23825          IRTemp vV    = newTemp(Ity_V256);
   23826          assign( vV, getYMMReg(rV) );
   23827          if (epartIsReg(modrm)) {
   23828             UInt rE = eregOfRexRM(pfx,modrm);
   23829             assign( eV, getYMMReg(rE) );
   23830             delta += 1;
   23831             DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
   23832                 nameYMMReg(rE), nameYMMReg(rG));
   23833          } else {
   23834             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23835             assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
   23836             delta += alen;
   23837             DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
   23838                 dis_buf, nameYMMReg(rG));
   23839          }
   23840          IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
   23841          putYMMReg( rG, mkexpr(res) );
   23842          *uses_vvvv = True;
   23843          goto decode_success;
   23844       }
   23845       /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
   23846       /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
   23847       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23848          Bool   hi    = opc == 0x15;
   23849          UChar  modrm = getUChar(delta);
   23850          UInt   rG    = gregOfRexRM(pfx,modrm);
   23851          UInt   rV    = getVexNvvvv(pfx);
   23852          IRTemp eV    = newTemp(Ity_V128);
   23853          IRTemp vV    = newTemp(Ity_V128);
   23854          assign( vV, getXMMReg(rV) );
   23855          if (epartIsReg(modrm)) {
   23856             UInt rE = eregOfRexRM(pfx,modrm);
   23857             assign( eV, getXMMReg(rE) );
   23858             delta += 1;
   23859             DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
   23860                 nameXMMReg(rE), nameXMMReg(rG));
   23861          } else {
   23862             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23863             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   23864             delta += alen;
   23865             DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
   23866                 dis_buf, nameXMMReg(rG));
   23867          }
   23868          IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
   23869          putYMMRegLoAndZU( rG, mkexpr(res) );
   23870          *uses_vvvv = True;
   23871          goto decode_success;
   23872       }
   23873       /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
   23874       /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
   23875       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23876          Bool   hi    = opc == 0x15;
   23877          UChar  modrm = getUChar(delta);
   23878          UInt   rG    = gregOfRexRM(pfx,modrm);
   23879          UInt   rV    = getVexNvvvv(pfx);
   23880          IRTemp eV    = newTemp(Ity_V256);
   23881          IRTemp vV    = newTemp(Ity_V256);
   23882          assign( vV, getYMMReg(rV) );
   23883          if (epartIsReg(modrm)) {
   23884             UInt rE = eregOfRexRM(pfx,modrm);
   23885             assign( eV, getYMMReg(rE) );
   23886             delta += 1;
   23887             DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
   23888                 nameYMMReg(rE), nameYMMReg(rG));
   23889          } else {
   23890             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23891             assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
   23892             delta += alen;
   23893             DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
   23894                 dis_buf, nameYMMReg(rG));
   23895          }
   23896          IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
   23897          putYMMReg( rG, mkexpr(res) );
   23898          *uses_vvvv = True;
   23899          goto decode_success;
   23900       }
   23901       break;
   23902 
   23903    case 0x16:
   23904       /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
   23905       /* Insn only exists in reg form */
   23906       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   23907           && epartIsReg(getUChar(delta))) {
   23908          UChar modrm = getUChar(delta);
   23909          UInt  rG    = gregOfRexRM(pfx, modrm);
   23910          UInt  rE    = eregOfRexRM(pfx, modrm);
   23911          UInt  rV    = getVexNvvvv(pfx);
   23912          delta++;
   23913          DIP("vmovlhps %s,%s,%s\n",
   23914              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23915          IRTemp res = newTemp(Ity_V128);
   23916          assign(res, binop(Iop_64HLtoV128,
   23917                            getXMMRegLane64(rE, 0),
   23918                            getXMMRegLane64(rV, 0)));
   23919          putYMMRegLoAndZU(rG, mkexpr(res));
   23920          *uses_vvvv = True;
   23921          goto decode_success;
   23922       }
   23923       /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
   23924       /* Insn exists only in mem form, it appears. */
   23925       /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
   23926       /* Insn exists only in mem form, it appears. */
   23927       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   23928           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   23929          UChar modrm = getUChar(delta);
   23930          UInt  rG    = gregOfRexRM(pfx, modrm);
   23931          UInt  rV    = getVexNvvvv(pfx);
   23932          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23933          delta += alen;
   23934          DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
   23935              dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23936          IRTemp res = newTemp(Ity_V128);
   23937          assign(res, binop(Iop_64HLtoV128,
   23938                            loadLE(Ity_I64, mkexpr(addr)),
   23939                            getXMMRegLane64(rV, 0)));
   23940          putYMMRegLoAndZU(rG, mkexpr(res));
   23941          *uses_vvvv = True;
   23942          goto decode_success;
   23943       }
   23944       /* VMOVSHDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 16 /r */
   23945       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   23946          delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
   23947                                    False/*!isL*/ );
   23948          goto decode_success;
   23949       }
   23950       /* VMOVSHDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 16 /r */
   23951       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   23952          delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
   23953          goto decode_success;
   23954       }
   23955       break;
   23956 
   23957    case 0x17:
   23958       /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
   23959       /* Insn exists only in mem form, it appears. */
   23960       /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
   23961       /* Insn exists only in mem form, it appears. */
   23962       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   23963           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   23964          UChar modrm = getUChar(delta);
   23965          UInt  rG    = gregOfRexRM(pfx, modrm);
   23966          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23967          delta += alen;
   23968          storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
   23969          DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
   23970              nameXMMReg(rG), dis_buf);
   23971          goto decode_success;
   23972       }
   23973       break;
   23974 
   23975    case 0x28:
   23976       /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
   23977       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23978          UChar modrm = getUChar(delta);
   23979          UInt  rG    = gregOfRexRM(pfx, modrm);
   23980          if (epartIsReg(modrm)) {
   23981             UInt rE = eregOfRexRM(pfx,modrm);
   23982             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23983             DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23984             delta += 1;
   23985          } else {
   23986             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23987             gen_SEGV_if_not_16_aligned( addr );
   23988             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23989             DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
   23990             delta += alen;
   23991          }
   23992          goto decode_success;
   23993       }
   23994       /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
   23995       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23996          UChar modrm = getUChar(delta);
   23997          UInt  rG    = gregOfRexRM(pfx, modrm);
   23998          if (epartIsReg(modrm)) {
   23999             UInt rE = eregOfRexRM(pfx,modrm);
   24000             putYMMReg( rG, getYMMReg( rE ));
   24001             DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   24002             delta += 1;
   24003          } else {
   24004             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24005             gen_SEGV_if_not_32_aligned( addr );
   24006             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   24007             DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
   24008             delta += alen;
   24009          }
   24010          goto decode_success;
   24011       }
   24012       /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
   24013       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24014          UChar modrm = getUChar(delta);
   24015          UInt  rG    = gregOfRexRM(pfx, modrm);
   24016          if (epartIsReg(modrm)) {
   24017             UInt rE = eregOfRexRM(pfx,modrm);
   24018             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   24019             DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   24020             delta += 1;
   24021          } else {
   24022             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24023             gen_SEGV_if_not_16_aligned( addr );
   24024             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   24025             DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
   24026             delta += alen;
   24027          }
   24028          goto decode_success;
   24029       }
   24030       /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
   24031       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24032          UChar modrm = getUChar(delta);
   24033          UInt  rG    = gregOfRexRM(pfx, modrm);
   24034          if (epartIsReg(modrm)) {
   24035             UInt rE = eregOfRexRM(pfx,modrm);
   24036             putYMMReg( rG, getYMMReg( rE ));
   24037             DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   24038             delta += 1;
   24039          } else {
   24040             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24041             gen_SEGV_if_not_32_aligned( addr );
   24042             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   24043             DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
   24044             delta += alen;
   24045          }
   24046          goto decode_success;
   24047       }
   24048       break;
   24049 
   24050    case 0x29:
   24051       /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
   24052       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24053          UChar modrm = getUChar(delta);
   24054          UInt  rG    = gregOfRexRM(pfx,modrm);
   24055          if (epartIsReg(modrm)) {
   24056             UInt rE = eregOfRexRM(pfx,modrm);
   24057             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24058             DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24059             delta += 1;
   24060          } else {
   24061             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24062             gen_SEGV_if_not_16_aligned( addr );
   24063             storeLE( mkexpr(addr), getXMMReg(rG) );
   24064             DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
   24065             delta += alen;
   24066          }
   24067          goto decode_success;
   24068       }
   24069       /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
   24070       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24071          UChar modrm = getUChar(delta);
   24072          UInt  rG    = gregOfRexRM(pfx,modrm);
   24073          if (epartIsReg(modrm)) {
   24074             UInt rE = eregOfRexRM(pfx,modrm);
   24075             putYMMReg( rE, getYMMReg(rG) );
   24076             DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24077             delta += 1;
   24078          } else {
   24079             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24080             gen_SEGV_if_not_32_aligned( addr );
   24081             storeLE( mkexpr(addr), getYMMReg(rG) );
   24082             DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
   24083             delta += alen;
   24084          }
   24085          goto decode_success;
   24086       }
   24087       /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
   24088       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24089          UChar modrm = getUChar(delta);
   24090          UInt  rG    = gregOfRexRM(pfx,modrm);
   24091          if (epartIsReg(modrm)) {
   24092             UInt rE = eregOfRexRM(pfx,modrm);
   24093             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24094             DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24095             delta += 1;
   24096             goto decode_success;
   24097          } else {
   24098             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24099             gen_SEGV_if_not_16_aligned( addr );
   24100             storeLE( mkexpr(addr), getXMMReg(rG) );
   24101             DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
   24102             delta += alen;
   24103             goto decode_success;
   24104          }
   24105       }
   24106       /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
   24107       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24108          UChar modrm = getUChar(delta);
   24109          UInt  rG    = gregOfRexRM(pfx,modrm);
   24110          if (epartIsReg(modrm)) {
   24111             UInt rE = eregOfRexRM(pfx,modrm);
   24112             putYMMReg( rE, getYMMReg(rG) );
   24113             DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24114             delta += 1;
   24115             goto decode_success;
   24116          } else {
   24117             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24118             gen_SEGV_if_not_32_aligned( addr );
   24119             storeLE( mkexpr(addr), getYMMReg(rG) );
   24120             DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
   24121             delta += alen;
   24122             goto decode_success;
   24123          }
   24124       }
   24125       break;
   24126 
   24127    case 0x2A: {
   24128       IRTemp rmode = newTemp(Ity_I32);
   24129       assign( rmode, get_sse_roundingmode() );
   24130       /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
   24131       if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   24132          UChar  modrm = getUChar(delta);
   24133          UInt   rV    = getVexNvvvv(pfx);
   24134          UInt   rD    = gregOfRexRM(pfx, modrm);
   24135          IRTemp arg32 = newTemp(Ity_I32);
   24136          if (epartIsReg(modrm)) {
   24137             UInt rS = eregOfRexRM(pfx,modrm);
   24138             assign( arg32, getIReg32(rS) );
   24139             delta += 1;
   24140             DIP("vcvtsi2sdl %s,%s,%s\n",
   24141                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24142          } else {
   24143             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24144             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24145             delta += alen;
   24146             DIP("vcvtsi2sdl %s,%s,%s\n",
   24147                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24148          }
   24149          putXMMRegLane64F( rD, 0,
   24150                            unop(Iop_I32StoF64, mkexpr(arg32)));
   24151          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24152          putYMMRegLane128( rD, 1, mkV128(0) );
   24153          *uses_vvvv = True;
   24154          goto decode_success;
   24155       }
   24156       /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
   24157       if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   24158          UChar  modrm = getUChar(delta);
   24159          UInt   rV    = getVexNvvvv(pfx);
   24160          UInt   rD    = gregOfRexRM(pfx, modrm);
   24161          IRTemp arg64 = newTemp(Ity_I64);
   24162          if (epartIsReg(modrm)) {
   24163             UInt rS = eregOfRexRM(pfx,modrm);
   24164             assign( arg64, getIReg64(rS) );
   24165             delta += 1;
   24166             DIP("vcvtsi2sdq %s,%s,%s\n",
   24167                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24168          } else {
   24169             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24170             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24171             delta += alen;
   24172             DIP("vcvtsi2sdq %s,%s,%s\n",
   24173                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24174          }
   24175          putXMMRegLane64F( rD, 0,
   24176                            binop( Iop_I64StoF64,
   24177                                   get_sse_roundingmode(),
   24178                                   mkexpr(arg64)) );
   24179          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24180          putYMMRegLane128( rD, 1, mkV128(0) );
   24181          *uses_vvvv = True;
   24182          goto decode_success;
   24183       }
   24184       /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
   24185       if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
   24186          UChar  modrm = getUChar(delta);
   24187          UInt   rV    = getVexNvvvv(pfx);
   24188          UInt   rD    = gregOfRexRM(pfx, modrm);
   24189          IRTemp arg64 = newTemp(Ity_I64);
   24190          if (epartIsReg(modrm)) {
   24191             UInt rS = eregOfRexRM(pfx,modrm);
   24192             assign( arg64, getIReg64(rS) );
   24193             delta += 1;
   24194             DIP("vcvtsi2ssq %s,%s,%s\n",
   24195                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24196          } else {
   24197             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24198             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24199             delta += alen;
   24200             DIP("vcvtsi2ssq %s,%s,%s\n",
   24201                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24202          }
   24203          putXMMRegLane32F( rD, 0,
   24204                            binop(Iop_F64toF32,
   24205                                  mkexpr(rmode),
   24206                                  binop(Iop_I64StoF64, mkexpr(rmode),
   24207                                                       mkexpr(arg64)) ) );
   24208          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24209          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24210          putYMMRegLane128( rD, 1, mkV128(0) );
   24211          *uses_vvvv = True;
   24212          goto decode_success;
   24213       }
      /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
      /* Convert a signed 32-bit integer (register or memory) to single
         precision in the low lane of xmm1.  The int32 -> F64 widening
         is exact; only the final F64 -> F32 step rounds, using the
         current SSE rounding mode (rmode, set up earlier for this
         opcode).  Lanes 1..3 are merged from the vvvv source and the
         upper YMM half is zeroed, per VEX.128 semantics. */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign( arg32, getIReg32(rS) );
            delta += 1;
            DIP("vcvtsi2ssl %s,%s,%s\n",
                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsi2ssl %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         putXMMRegLane32F( rD, 0,
                           binop(Iop_F64toF32,
                                 mkexpr(rmode),
                                 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         /* Merge the remainder of the destination from rV, then zero
            the high 128 bits of the YMM register. */
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
   24243       break;
   24244    }
   24245 
   case 0x2B:
      /* Non-temporal packed-FP stores, memory destination only (the
         register form of the modrm byte is rejected).  The effective
         address must be 16- (xmm) or 32-byte (ymm) aligned; a
         misaligned access is modelled as SEGV.  The non-temporal
         cache hint itself is not modelled. */
      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         assign(tS, getXMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         /* 66 prefix selects the PD spelling, else PS. */
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rS), dis_buf);
         goto decode_success;
      }
      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         assign(tS, getYMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameYMMReg(rS), dis_buf);
         goto decode_success;
      }
      break;
   24280 
   case 0x2C:
      /* Truncating (round-towards-zero) scalar FP -> signed integer
         conversions.  VEX.W selects the 32- vs 64-bit integer
         destination; the trailing 4/8 passes that size to the helper.
         NOTE(review): operand comments corrected -- per the Intel SDM
         the SD source is always xmm/m64 and the SS source always
         xmm/m32; W changes only the destination register width. */
      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;

   case 0x2D:
      /* As for 0x2C, but rounding according to the current SSE
         rounding mode instead of truncating.  The shared helpers
         distinguish the two behaviours via opc. */
      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24326 
   case 0x2E:
   case 0x2F:
      /* Scalar FP compares that set EFLAGS: 2E is the unordered
         (UCOMIS*) form, 2F the ordered (COMIS*) form.  Both opcodes
         share a helper which dispatches on opc; the 66 prefix selects
         the double-precision variant. */
      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
      if (haveNo66noF2noF3(pfx)) {
         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      break;
   24342 
   case 0x50:
      /* Extract the per-lane FP sign bits into an integer register.
         Four variants: PD/PS x 128/256, selected by the 66 prefix and
         VEX.L; all delegate to dedicated helpers. */
      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   24365 
   case 0x51:
      /* Square-root family.  Scalar forms (SS/SD) operate on the low
         lane only and merge the rest from the vvvv register; packed
         forms apply to all lanes.  NOTE(review): the VSQRTSS operand
         comment said m64 -- per the Intel SDM the SS source is
         xmm/m32; corrected below. */
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
         goto decode_success;
      }
      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
         goto decode_success;
      }
      break;
   24404 
   case 0x52:
      /* Reciprocal-square-root estimates (single precision only; no
         PD forms exist).  NOTE(review): the scalar operand comment
         said m64 -- per the Intel SDM the SS source is xmm/m32;
         corrected below. */
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtss",
                    Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx8 );
         goto decode_success;
      }
      break;

   case 0x53:
      /* Reciprocal estimates, same structure as 0x52.  NOTE(review):
         scalar operand comment likewise corrected to m32. */
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx8 );
         goto decode_success;
      }
      break;
   24447 
   case 0x54:
      /* Bitwise AND of packed FP values.  PD and PS perform the same
         bit operation; only the printed mnemonic differs. */
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
         goto decode_success;
      }
      break;
   24476 
   24477    case 0x55:
   24478       /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
   24479       /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
   24480       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24481          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24482                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
   24483                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24484          goto decode_success;
   24485       }
   24486       /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
   24487       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24488          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24489                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
   24490                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24491          goto decode_success;
   24492       }
   24493       /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
   24494       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24495          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24496                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
   24497                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24498          goto decode_success;
   24499       }
   24500       /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
   24501       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24502          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24503                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
   24504                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24505          goto decode_success;
   24506       }
   24507       break;
   24508 
   case 0x56:
      /* Bitwise OR of packed FP values; PD and PS are the same bit
         operation, only the printed mnemonic differs. */
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;

   case 0x57:
      /* Bitwise XOR of packed FP values, same structure as 0x56. */
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;
   24570 
   case 0x58:
      /* FP add family.  Decode order: scalar SD/SS first (selected by
         the F2/F3 prefix, L ignored), then packed PS/PD at 128 and
         256 bits.  Scalar forms merge the non-result lanes from the
         vvvv register. */
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
      break;

   case 0x59:
      /* FP multiply family, identical decode structure to 0x58. */
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;
   24648 
   case 0x5A:
      /* FP precision conversions.  Packed forms delegate to helpers;
         the two scalar forms are decoded inline below. */
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      /* Narrowing F64 -> F32 can lose precision, so it rounds using
         the current SSE rounding mode.  Non-result lanes are merged
         from the vvvv register; YMM high half is zeroed. */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      /* Widening F32 -> F64 is exact, hence the rounding-mode-free
         unop Iop_F32toF64. */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   24727 
   case 0x5B:
      /* Packed int32 <-> F32 conversions.  The F3-prefixed forms are
         the truncating (round-to-zero) variants, signalled via the
         r2zero flag to the shared helper. */
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   24764 
   case 0x5C:
      /* FP subtract family, same decode structure as 0x58 (add):
         scalar SD/SS first, then packed PS/PD at 128 and 256 bits. */
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;
   24803 
   case 0x5D:
      /* FP minimum family, same decode structure as 0x58. */
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;
   24842 
   case 0x5E:
      /* FP divide family, same decode structure as 0x58. */
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;
   24881 
   24882    case 0x5F:
               /* 0x5F = VMAX{SD,SS,PS,PD}: FP maximum, same decode layout as
                  the 0x5D (VMIN) and 0x5E (VDIV) arms. */
   24883       /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
   24884       if (haveF2no66noF3(pfx)) {
   24885          delta = dis_AVX128_E_V_to_G_lo64(
   24886                     uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
   24887          goto decode_success;
   24888       }
   24889       /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
   24890       if (haveF3no66noF2(pfx)) {
   24891          delta = dis_AVX128_E_V_to_G_lo32(
   24892                     uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
   24893          goto decode_success;
   24894       }
   24895       /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
   24896       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24897          delta = dis_AVX128_E_V_to_G(
   24898                     uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
   24899          goto decode_success;
   24900       }
   24901       /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
   24902       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24903          delta = dis_AVX256_E_V_to_G(
   24904                     uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
   24905          goto decode_success;
   24906       }
   24907       /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
   24908       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24909          delta = dis_AVX128_E_V_to_G(
   24910                     uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
   24911          goto decode_success;
   24912       }
   24913       /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
   24914       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24915          delta = dis_AVX256_E_V_to_G(
   24916                     uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
   24917          goto decode_success;
   24918       }
   24919       break;
   24920 
   24921    case 0x60:
               /* 0x60 = VPUNPCKLBW.  Note swapArgs=True: the IR InterleaveLO op
                  takes its operands in the opposite order from the x86 insn. */
   24922       /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
   24923       /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
   24924       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24925          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24926                     uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
   24927                     Iop_InterleaveLO8x16, NULL,
   24928                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24929          goto decode_success;
   24930       }
   24931       /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
   24932       /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
   24933       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24934          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24935                     uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
   24936                     math_VPUNPCKLBW_YMM );
   24937          goto decode_success;
   24938       }
   24939       break;
   24940 
   24941    case 0x61:
               /* 0x61 = VPUNPCKLWD, same shape as 0x60 but 16-bit lanes. */
   24942       /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
   24943       /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
   24944       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24945          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24946                     uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
   24947                     Iop_InterleaveLO16x8, NULL,
   24948                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24949          goto decode_success;
   24950       }
   24951       /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
   24952       /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
   24953       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24954          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24955                     uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
   24956                     math_VPUNPCKLWD_YMM );
   24957          goto decode_success;
   24958       }
   24959       break;
   24960 
   24961    case 0x62:
               /* 0x62 = VPUNPCKLDQ, same shape as 0x60 but 32-bit lanes. */
   24962       /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
   24963       /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
   24964       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24965          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24966                     uses_vvvv, vbi, pfx, delta, "vpunpckldq",
   24967                     Iop_InterleaveLO32x4, NULL,
   24968                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24969          goto decode_success;
   24970       }
   24971       /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
   24972       /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
   24973       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24974          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24975                     uses_vvvv, vbi, pfx, delta, "vpunpckldq",
   24976                     math_VPUNPCKLDQ_YMM );
   24977          goto decode_success;
   24978       }
   24979       break;
   24980 
   24981    case 0x63:
               /* 0x63 = VPACKSSWB: signed-saturating narrow 16->8. */
   24982       /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
   24983       /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
   24984       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24985          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24986                     uses_vvvv, vbi, pfx, delta, "vpacksswb",
   24987                     Iop_QNarrowBin16Sto8Sx16, NULL,
   24988                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24989          goto decode_success;
   24990       }
   24991       /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
   24992       /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
   24993       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24994          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24995                     uses_vvvv, vbi, pfx, delta, "vpacksswb",
   24996                     math_VPACKSSWB_YMM );
   24997          goto decode_success;
   24998       }
   24999       break;
   25000 
   25001    case 0x64:
               /* 0x64-0x66 = VPCMPGT{B,W,D}: lane-wise signed greater-than,
                  producing all-ones / all-zeroes per lane. */
   25002       /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
   25003       /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
   25004       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25005          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25006                     uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
   25007          goto decode_success;
   25008       }
   25009       /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
   25010       /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
   25011       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25012          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25013                     uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
   25014          goto decode_success;
   25015       }
   25016       break;
   25017 
   25018    case 0x65:
   25019       /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
   25020       /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
   25021       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25022          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25023                     uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
   25024          goto decode_success;
   25025       }
   25026       /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
   25027       /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
   25028       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25029          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25030                     uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
   25031          goto decode_success;
   25032       }
   25033       break;
   25034 
   25035    case 0x66:
   25036       /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
   25037       /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
   25038       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25039          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25040                     uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
   25041          goto decode_success;
   25042       }
   25043       /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
   25044       /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
   25045       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25046          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25047                     uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
   25048          goto decode_success;
   25049       }
   25050       break;
   25051 
   25052    case 0x67:
               /* 0x67 = VPACKUSWB: unsigned-saturating narrow 16->8
                  (signed inputs clamped to the 0..255 range). */
   25053       /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
   25054       /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
   25055       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25056          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25057                     uses_vvvv, vbi, pfx, delta, "vpackuswb",
   25058                     Iop_QNarrowBin16Sto8Ux16, NULL,
   25059                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25060          goto decode_success;
   25061       }
   25062       /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
   25063       /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
   25064       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25065          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25066                     uses_vvvv, vbi, pfx, delta, "vpackuswb",
   25067                     math_VPACKUSWB_YMM );
   25068          goto decode_success;
   25069       }
   25070       break;
   25071 
   25072    case 0x68:
               /* 0x68/0x69 = VPUNPCKH{BW,WD}: interleave high lanes.  The
                  encoding comments previously omitted the mandatory 66 prefix;
                  the guards (have66noF2noF3) and the Intel SDM both require it. */
   25073       /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
   25074       /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
   25075       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25076          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25077                     uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
   25078                     Iop_InterleaveHI8x16, NULL,
   25079                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25080          goto decode_success;
   25081       }
   25082       /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
   25083       /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
   25084       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25085          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25086                     uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
   25087                     math_VPUNPCKHBW_YMM );
   25088          goto decode_success;
   25089       }
   25090       break;
   25091 
   25092    case 0x69:
   25093       /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
   25094       /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
   25095       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25096          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25097                     uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
   25098                     Iop_InterleaveHI16x8, NULL,
   25099                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25100          goto decode_success;
   25101       }
   25102       /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
   25103       /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
   25104       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25105          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25106                     uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
   25107                     math_VPUNPCKHWD_YMM );
   25108          goto decode_success;
   25109       }
   25110       break;
   25111 
   25112    case 0x6A:
               /* 0x6A = VPUNPCKHDQ: interleave high 32-bit lanes. */
   25113       /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
   25114       /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
   25115       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25116          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25117                     uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
   25118                     Iop_InterleaveHI32x4, NULL,
   25119                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25120          goto decode_success;
   25121       }
   25122       /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
   25123       /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
   25124       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25125          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25126                     uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
   25127                     math_VPUNPCKHDQ_YMM );
   25128          goto decode_success;
   25129       }
   25130       break;
   25131 
   25132    case 0x6B:
               /* 0x6B = VPACKSSDW: signed-saturating narrow 32->16. */
   25133       /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
   25134       /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
   25135       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25136          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25137                     uses_vvvv, vbi, pfx, delta, "vpackssdw",
   25138                     Iop_QNarrowBin32Sto16Sx8, NULL,
   25139                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25140          goto decode_success;
   25141       }
   25142       /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
   25143       /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
   25144       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25145          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25146                     uses_vvvv, vbi, pfx, delta, "vpackssdw",
   25147                     math_VPACKSSDW_YMM );
   25148          goto decode_success;
   25149       }
   25150       break;
   25151 
   25152    case 0x6C:
               /* 0x6C/0x6D = VPUNPCK{L,H}QDQ: interleave 64-bit lanes.  The
                  encoding comments previously omitted the mandatory 66 prefix;
                  the guards (have66noF2noF3) and the Intel SDM both require it. */
   25153       /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
   25154       /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
   25155       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25156          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25157                     uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
   25158                     Iop_InterleaveLO64x2, NULL,
   25159                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25160          goto decode_success;
   25161       }
   25162       /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
   25163       /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
   25164       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25165          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25166                     uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
   25167                     math_VPUNPCKLQDQ_YMM );
   25168          goto decode_success;
   25169       }
   25170       break;
   25171 
   25172    case 0x6D:
   25173       /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
   25174       /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
   25175       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25176          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25177                     uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
   25178                     Iop_InterleaveHI64x2, NULL,
   25179                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25180          goto decode_success;
   25181       }
   25182       /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
   25183       /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
   25184       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25185          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25186                     uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
   25187                     math_VPUNPCKHQDQ_YMM );
   25188          goto decode_success;
   25189       }
   25190       break;
   25191 
   25192    case 0x6E:
               /* 0x6E = VMOVD/VMOVQ (W0/W1): move GPR or memory into the low
                  32/64 bits of an xmm register.  putYMMRegLoAndZU writes the
                  low 128 bits and zeroes the upper lanes, per VEX semantics. */
   25193       /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E /r */
   25194       if (have66noF2noF3(pfx)
   25195           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25196          vassert(sz == 2); /* even tho we are transferring 4, not 2. */
   25197          UChar modrm = getUChar(delta);
   25198          if (epartIsReg(modrm)) {
   25199             delta += 1;
   25200             putYMMRegLoAndZU(
   25201                gregOfRexRM(pfx,modrm),
   25202                unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
   25203             );
   25204             DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   25205                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25206         } else {
   25207             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25208             delta += alen;
   25209             putYMMRegLoAndZU(
   25210                gregOfRexRM(pfx,modrm),
   25211                unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
   25212                              );
   25213             DIP("vmovd %s, %s\n", dis_buf,
   25214                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25215          }
   25216          goto decode_success;
   25217       }
   25218       /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E /r */
   25219       if (have66noF2noF3(pfx)
   25220           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   25221          vassert(sz == 2); /* even tho we are transferring 8, not 2. */
   25222          UChar modrm = getUChar(delta);
   25223          if (epartIsReg(modrm)) {
   25224             delta += 1;
   25225             putYMMRegLoAndZU(
   25226                gregOfRexRM(pfx,modrm),
   25227                unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
   25228             );
   25229             DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   25230                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25231         } else {
   25232             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25233             delta += alen;
   25234             putYMMRegLoAndZU(
   25235                gregOfRexRM(pfx,modrm),
   25236                unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
   25237                              );
   25238             DIP("vmovq %s, %s\n", dis_buf,
   25239                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25240          }
   25241          goto decode_success;
   25242       }
   25243       break;
   25244 
   25245    case 0x6F:
               /* 0x6F = VMOVDQA/VMOVDQU loads (E to G).  Only the 'A' (aligned,
                  66-prefixed) form gets an alignment check; misalignment there
                  raises SIGSEGV via gen_SEGV_if_not_{16,32}_aligned. */
   25246       /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
   25247       /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
   25248       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25249           && 1==getVexL(pfx)/*256*/) {
   25250          UChar  modrm = getUChar(delta);
   25251          UInt   rD    = gregOfRexRM(pfx, modrm);
   25252          IRTemp tD    = newTemp(Ity_V256);
   25253          Bool   isA   = have66noF2noF3(pfx);
   25254          HChar  ch    = isA ? 'a' : 'u';
   25255          if (epartIsReg(modrm)) {
   25256             UInt rS = eregOfRexRM(pfx, modrm);
   25257             delta += 1;
   25258             assign(tD, getYMMReg(rS));
   25259             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
   25260          } else {
   25261             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25262             delta += alen;
   25263             if (isA)
   25264                gen_SEGV_if_not_32_aligned(addr);
   25265             assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   25266             DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
   25267          }
   25268          putYMMReg(rD, mkexpr(tD));
   25269          goto decode_success;
   25270       }
   25271       /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
   25272       /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
   25273       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25274           && 0==getVexL(pfx)/*128*/) {
   25275          UChar  modrm = getUChar(delta);
   25276          UInt   rD    = gregOfRexRM(pfx, modrm);
   25277          IRTemp tD    = newTemp(Ity_V128);
   25278          Bool   isA   = have66noF2noF3(pfx);
   25279          HChar  ch    = isA ? 'a' : 'u';
   25280          if (epartIsReg(modrm)) {
   25281             UInt rS = eregOfRexRM(pfx, modrm);
   25282             delta += 1;
   25283             assign(tD, getXMMReg(rS));
   25284             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
   25285          } else {
   25286             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25287             delta += alen;
   25288             if (isA)
   25289                gen_SEGV_if_not_16_aligned(addr);
   25290             assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   25291             DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
   25292          }
   25293          putYMMRegLoAndZU(rD, mkexpr(tD));
   25294          goto decode_success;
   25295       }
   25296       break;
   25297 
   25298    case 0x70:
               /* 0x70 = VPSHUFD (66), VPSHUFLW (F2), VPSHUFHW (F3):
                  immediate-controlled lane shuffles, 128- and 256-bit forms. */
   25299       /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
   25300       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25301          delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
   25302          goto decode_success;
   25303       }
   25304       /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
   25305       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25306          delta = dis_PSHUFD_32x8( vbi, pfx, delta);
   25307          goto decode_success;
   25308       }
   25309       /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
   25310       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25311          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   25312                                   True/*isAvx*/, False/*!xIsH*/ );
   25313          goto decode_success;
   25314       }
   25315       /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
   25316       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25317          delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
   25318          goto decode_success;
   25319       }
   25320       /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
   25321       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   25322          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   25323                                   True/*isAvx*/, True/*xIsH*/ );
   25324          goto decode_success;
   25325       }
   25326       /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
   25327       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   25328          delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
   25329          goto decode_success;
   25330       }
   25331       break;
   25332 
   25333    case 0x71:
               /* 0x71 = Grp12: 16-bit-lane shifts by immediate.  The /reg field
                  of the modrm byte selects the operation (2=SRL, 4=SRA, 6=SLL);
                  the E operand must be a register, hence the epartIsReg guard.
                  An unrecognised /reg value falls through to decode failure. */
   25334       /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
   25335       /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
   25336       /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
   25337       if (have66noF2noF3(pfx)
   25338           && 0==getVexL(pfx)/*128*/
   25339           && epartIsReg(getUChar(delta))) {
   25340          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25341             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25342                                                 "vpsrlw", Iop_ShrN16x8 );
   25343             *uses_vvvv = True;
   25344             goto decode_success;
   25345          }
   25346          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25347             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25348                                                 "vpsraw", Iop_SarN16x8 );
   25349             *uses_vvvv = True;
   25350             goto decode_success;
   25351          }
   25352          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25353             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25354                                                 "vpsllw", Iop_ShlN16x8 );
   25355             *uses_vvvv = True;
   25356             goto decode_success;
   25357          }
   25358          /* else fall through */
   25359       }
   25360       /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
   25361       /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
   25362       /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
   25363       if (have66noF2noF3(pfx)
   25364           && 1==getVexL(pfx)/*256*/
   25365           && epartIsReg(getUChar(delta))) {
   25366          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25367             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25368                                                 "vpsrlw", Iop_ShrN16x16 );
   25369             *uses_vvvv = True;
   25370             goto decode_success;
   25371          }
   25372          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25373             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25374                                                 "vpsraw", Iop_SarN16x16 );
   25375             *uses_vvvv = True;
   25376             goto decode_success;
   25377          }
   25378          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25379             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25380                                                 "vpsllw", Iop_ShlN16x16 );
   25381             *uses_vvvv = True;
   25382             goto decode_success;
   25383          }
   25384          /* else fall through */
   25385       }
   25386       break;
   25387 
   25388    case 0x72:
               /* 0x72 = Grp13: 32-bit-lane shifts by immediate; identical
                  structure to the 0x71 (16-bit) arm, /reg selects 2=SRL,
                  4=SRA, 6=SLL. */
   25389       /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
   25390       /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
   25391       /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
   25392       if (have66noF2noF3(pfx)
   25393           && 0==getVexL(pfx)/*128*/
   25394           && epartIsReg(getUChar(delta))) {
   25395          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25396             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25397                                                 "vpsrld", Iop_ShrN32x4 );
   25398             *uses_vvvv = True;
   25399             goto decode_success;
   25400          }
   25401          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25402             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25403                                                 "vpsrad", Iop_SarN32x4 );
   25404             *uses_vvvv = True;
   25405             goto decode_success;
   25406          }
   25407          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25408             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25409                                                 "vpslld", Iop_ShlN32x4 );
   25410             *uses_vvvv = True;
   25411             goto decode_success;
   25412          }
   25413          /* else fall through */
   25414       }
   25415       /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
   25416       /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
   25417       /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
   25418       if (have66noF2noF3(pfx)
   25419           && 1==getVexL(pfx)/*256*/
   25420           && epartIsReg(getUChar(delta))) {
   25421          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25422             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25423                                                 "vpsrld", Iop_ShrN32x8 );
   25424             *uses_vvvv = True;
   25425             goto decode_success;
   25426          }
   25427          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25428             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25429                                                 "vpsrad", Iop_SarN32x8 );
   25430             *uses_vvvv = True;
   25431             goto decode_success;
   25432          }
   25433          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25434             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25435                                                 "vpslld", Iop_ShlN32x8 );
   25436             *uses_vvvv = True;
   25437             goto decode_success;
   25438          }
   25439          /* else fall through */
   25440       }
   25441       break;
   25442 
   case 0x73:
      /* Opcode 0x73 is a shift-by-immediate group: the reg field of the
         modrm byte (/2, /3, /6, /7) selects the operation.  All forms are
         register-only (the modrm r/m field must encode a register), and
         use the NDD encoding: source is r/m, destination is VEX.vvvv. */
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         IRTemp vecS = newTemp(Ity_V128);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            /* /3: VPSRLDQ -- whole-vector right shift by imm8 bytes */
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;   /* consume modrm byte + imm8 */
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            /* /7: VPSLLDQ -- whole-vector left shift by imm8 bytes */
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;   /* consume modrm byte + imm8 */
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            /* /2: VPSRLQ -- per-64-bit-lane logical right shift */
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            /* /6: VPSLLQ -- per-64-bit-lane logical left shift */
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* 256-bit (AVX2) forms: same group, applied to both 128-bit lanes
         of the ymm source independently. */
      /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            /* /3: VPSRLDQ -- byte shift right, each 128-bit lane */
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;   /* consume modrm byte + imm8 */
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            /* /7: VPSLLDQ -- byte shift left, each 128-bit lane */
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;   /* consume modrm byte + imm8 */
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            /* /2: VPSRLQ -- per-64-bit-lane logical right shift */
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            /* /6: VPSLLQ -- per-64-bit-lane logical left shift */
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   25534 
   25535    case 0x74:
   25536       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   25537       /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
   25538       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25539          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25540                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
   25541          goto decode_success;
   25542       }
   25543       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   25544       /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
   25545       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25546          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25547                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
   25548          goto decode_success;
   25549       }
   25550       break;
   25551 
   25552    case 0x75:
   25553       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   25554       /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
   25555       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25556          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25557                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
   25558          goto decode_success;
   25559       }
   25560       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   25561       /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
   25562       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25563          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25564                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
   25565          goto decode_success;
   25566       }
   25567       break;
   25568 
   25569    case 0x76:
   25570       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   25571       /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
   25572       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25573          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25574                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
   25575          goto decode_success;
   25576       }
   25577       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   25578       /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
   25579       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25580          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25581                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
   25582          goto decode_success;
   25583       }
   25584       break;
   25585 
   25586    case 0x77:
   25587       /* VZEROUPPER = VEX.128.0F.WIG 77 */
   25588       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25589          Int i;
   25590          IRTemp zero128 = newTemp(Ity_V128);
   25591          assign(zero128, mkV128(0));
   25592          for (i = 0; i < 16; i++) {
   25593             putYMMRegLane128(i, 1, mkexpr(zero128));
   25594          }
   25595          DIP("vzeroupper\n");
   25596          goto decode_success;
   25597       }
   25598       /* VZEROALL = VEX.256.0F.WIG 77 */
   25599       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25600          Int i;
   25601          IRTemp zero128 = newTemp(Ity_V128);
   25602          assign(zero128, mkV128(0));
   25603          for (i = 0; i < 16; i++) {
   25604             putYMMRegLoAndZU(i, mkexpr(zero128));
   25605          }
   25606          DIP("vzeroall\n");
   25607          goto decode_success;
   25608       }
   25609       break;
   25610 
   25611    case 0x7C:
   25612    case 0x7D:
   25613       /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
   25614       /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
   25615       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25616          IRTemp sV     = newTemp(Ity_V128);
   25617          IRTemp dV     = newTemp(Ity_V128);
   25618          Bool   isAdd  = opc == 0x7C;
   25619          const HChar* str = isAdd ? "add" : "sub";
   25620          UChar modrm   = getUChar(delta);
   25621          UInt   rG     = gregOfRexRM(pfx,modrm);
   25622          UInt   rV     = getVexNvvvv(pfx);
   25623          if (epartIsReg(modrm)) {
   25624             UInt rE = eregOfRexRM(pfx,modrm);
   25625             assign( sV, getXMMReg(rE) );
   25626             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   25627                 nameXMMReg(rV), nameXMMReg(rG));
   25628             delta += 1;
   25629          } else {
   25630             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25631             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   25632             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25633                 nameXMMReg(rV), nameXMMReg(rG));
   25634             delta += alen;
   25635          }
   25636          assign( dV, getXMMReg(rV) );
   25637          putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
   25638          *uses_vvvv = True;
   25639          goto decode_success;
   25640       }
   25641       /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
   25642       /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
   25643       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25644          IRTemp sV     = newTemp(Ity_V256);
   25645          IRTemp dV     = newTemp(Ity_V256);
   25646          IRTemp s1, s0, d1, d0;
   25647          Bool   isAdd  = opc == 0x7C;
   25648          const HChar* str = isAdd ? "add" : "sub";
   25649          UChar modrm   = getUChar(delta);
   25650          UInt   rG     = gregOfRexRM(pfx,modrm);
   25651          UInt   rV     = getVexNvvvv(pfx);
   25652          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   25653          if (epartIsReg(modrm)) {
   25654             UInt rE = eregOfRexRM(pfx,modrm);
   25655             assign( sV, getYMMReg(rE) );
   25656             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   25657                 nameYMMReg(rV), nameYMMReg(rG));
   25658             delta += 1;
   25659          } else {
   25660             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25661             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   25662             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25663                 nameYMMReg(rV), nameYMMReg(rG));
   25664             delta += alen;
   25665          }
   25666          assign( dV, getYMMReg(rV) );
   25667          breakupV256toV128s( dV, &d1, &d0 );
   25668          breakupV256toV128s( sV, &s1, &s0 );
   25669          putYMMReg( rG, binop(Iop_V128HLtoV256,
   25670                               mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
   25671                               mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
   25672          *uses_vvvv = True;
   25673          goto decode_success;
   25674       }
   25675       /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
   25676       /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
   25677       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25678          IRTemp sV     = newTemp(Ity_V128);
   25679          IRTemp dV     = newTemp(Ity_V128);
   25680          Bool   isAdd  = opc == 0x7C;
   25681          const HChar* str = isAdd ? "add" : "sub";
   25682          UChar modrm   = getUChar(delta);
   25683          UInt   rG     = gregOfRexRM(pfx,modrm);
   25684          UInt   rV     = getVexNvvvv(pfx);
   25685          if (epartIsReg(modrm)) {
   25686             UInt rE = eregOfRexRM(pfx,modrm);
   25687             assign( sV, getXMMReg(rE) );
   25688             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   25689                 nameXMMReg(rV), nameXMMReg(rG));
   25690             delta += 1;
   25691          } else {
   25692             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25693             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   25694             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25695                 nameXMMReg(rV), nameXMMReg(rG));
   25696             delta += alen;
   25697          }
   25698          assign( dV, getXMMReg(rV) );
   25699          putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
   25700          *uses_vvvv = True;
   25701          goto decode_success;
   25702       }
   25703       /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
   25704       /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
   25705       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25706          IRTemp sV     = newTemp(Ity_V256);
   25707          IRTemp dV     = newTemp(Ity_V256);
   25708          IRTemp s1, s0, d1, d0;
   25709          Bool   isAdd  = opc == 0x7C;
   25710          const HChar* str = isAdd ? "add" : "sub";
   25711          UChar modrm   = getUChar(delta);
   25712          UInt   rG     = gregOfRexRM(pfx,modrm);
   25713          UInt   rV     = getVexNvvvv(pfx);
   25714          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   25715          if (epartIsReg(modrm)) {
   25716             UInt rE = eregOfRexRM(pfx,modrm);
   25717             assign( sV, getYMMReg(rE) );
   25718             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   25719                 nameYMMReg(rV), nameYMMReg(rG));
   25720             delta += 1;
   25721          } else {
   25722             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25723             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   25724             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25725                 nameYMMReg(rV), nameYMMReg(rG));
   25726             delta += alen;
   25727          }
   25728          assign( dV, getYMMReg(rV) );
   25729          breakupV256toV128s( dV, &d1, &d0 );
   25730          breakupV256toV128s( sV, &s1, &s0 );
   25731          putYMMReg( rG, binop(Iop_V128HLtoV256,
   25732                               mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
   25733                               mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
   25734          *uses_vvvv = True;
   25735          goto decode_success;
   25736       }
   25737       break;
   25738 
   25739    case 0x7E:
   25740       /* Note the Intel docs don't make sense for this.  I think they
   25741          are wrong.  They seem to imply it is a store when in fact I
   25742          think it is a load.  Also it's unclear whether this is W0, W1
   25743          or WIG. */
   25744       /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
   25745       if (haveF3no66noF2(pfx)
   25746           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25747          vassert(sz == 4); /* even tho we are transferring 8, not 4. */
   25748          UChar modrm = getUChar(delta);
   25749          UInt  rG    = gregOfRexRM(pfx,modrm);
   25750          if (epartIsReg(modrm)) {
   25751             UInt rE = eregOfRexRM(pfx,modrm);
   25752             putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
   25753             DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   25754             delta += 1;
   25755          } else {
   25756             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25757             putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   25758             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   25759             delta += alen;
   25760          }
   25761          /* zero bits 255:64 */
   25762          putXMMRegLane64( rG, 1, mkU64(0) );
   25763          putYMMRegLane128( rG, 1, mkV128(0) );
   25764          goto decode_success;
   25765       }
   25766       /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
   25767       /* Moves from G to E, so is a store-form insn */
   25768       /* Intel docs list this in the VMOVD entry for some reason. */
   25769       if (have66noF2noF3(pfx)
   25770           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   25771          UChar modrm = getUChar(delta);
   25772          UInt  rG    = gregOfRexRM(pfx,modrm);
   25773          if (epartIsReg(modrm)) {
   25774             UInt rE = eregOfRexRM(pfx,modrm);
   25775             DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
   25776             putIReg64(rE, getXMMRegLane64(rG, 0));
   25777             delta += 1;
   25778          } else {
   25779             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25780             storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
   25781             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   25782             delta += alen;
   25783          }
   25784          goto decode_success;
   25785       }
   25786       /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
   25787       /* Moves from G to E, so is a store-form insn */
   25788       if (have66noF2noF3(pfx)
   25789           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25790          UChar modrm = getUChar(delta);
   25791          UInt  rG    = gregOfRexRM(pfx,modrm);
   25792          if (epartIsReg(modrm)) {
   25793             UInt rE = eregOfRexRM(pfx,modrm);
   25794             DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
   25795             putIReg32(rE, getXMMRegLane32(rG, 0));
   25796             delta += 1;
   25797          } else {
   25798             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25799             storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
   25800             DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
   25801             delta += alen;
   25802          }
   25803          goto decode_success;
   25804       }
   25805       break;
   25806 
   25807    case 0x7F:
   25808       /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
   25809       /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
   25810       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25811           && 1==getVexL(pfx)/*256*/) {
   25812          UChar  modrm = getUChar(delta);
   25813          UInt   rS    = gregOfRexRM(pfx, modrm);
   25814          IRTemp tS    = newTemp(Ity_V256);
   25815          Bool   isA   = have66noF2noF3(pfx);
   25816          HChar  ch    = isA ? 'a' : 'u';
   25817          assign(tS, getYMMReg(rS));
   25818          if (epartIsReg(modrm)) {
   25819             UInt rD = eregOfRexRM(pfx, modrm);
   25820             delta += 1;
   25821             putYMMReg(rD, mkexpr(tS));
   25822             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
   25823          } else {
   25824             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25825             delta += alen;
   25826             if (isA)
   25827                gen_SEGV_if_not_32_aligned(addr);
   25828             storeLE(mkexpr(addr), mkexpr(tS));
   25829             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
   25830          }
   25831          goto decode_success;
   25832       }
   25833       /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
   25834       /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
   25835       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25836           && 0==getVexL(pfx)/*128*/) {
   25837          UChar  modrm = getUChar(delta);
   25838          UInt   rS    = gregOfRexRM(pfx, modrm);
   25839          IRTemp tS    = newTemp(Ity_V128);
   25840          Bool   isA   = have66noF2noF3(pfx);
   25841          HChar  ch    = isA ? 'a' : 'u';
   25842          assign(tS, getXMMReg(rS));
   25843          if (epartIsReg(modrm)) {
   25844             UInt rD = eregOfRexRM(pfx, modrm);
   25845             delta += 1;
   25846             putYMMRegLoAndZU(rD, mkexpr(tS));
   25847             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
   25848          } else {
   25849             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25850             delta += alen;
   25851             if (isA)
   25852                gen_SEGV_if_not_16_aligned(addr);
   25853             storeLE(mkexpr(addr), mkexpr(tS));
   25854             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
   25855          }
   25856          goto decode_success;
   25857       }
   25858       break;
   25859 
   25860    case 0xAE:
   25861       /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
   25862       if (haveNo66noF2noF3(pfx)
   25863           && 0==getVexL(pfx)/*LZ*/
   25864           && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
   25865           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
   25866           && sz == 4) {
   25867          delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
   25868          goto decode_success;
   25869       }
   25870       /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
   25871       if (haveNo66noF2noF3(pfx)
   25872           && 0==getVexL(pfx)/*LZ*/
   25873           && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
   25874           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
   25875           && sz == 4) {
   25876          delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
   25877          goto decode_success;
   25878       }
   25879       break;
   25880 
   25881    case 0xC2:
   25882       /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
   25883       /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
   25884       if (haveF2no66noF3(pfx)) {
   25885          Long delta0 = delta;
   25886          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25887                                           "vcmpsd", False/*!all_lanes*/,
   25888                                           8/*sz*/);
   25889          if (delta > delta0) goto decode_success;
   25890          /* else fall through -- decoding has failed */
   25891       }
   25892       /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
   25893       /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
   25894       if (haveF3no66noF2(pfx)) {
   25895          Long delta0 = delta;
   25896          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25897                                           "vcmpss", False/*!all_lanes*/,
   25898                                           4/*sz*/);
   25899          if (delta > delta0) goto decode_success;
   25900          /* else fall through -- decoding has failed */
   25901       }
   25902       /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
   25903       /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
   25904       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25905          Long delta0 = delta;
   25906          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25907                                           "vcmppd", True/*all_lanes*/,
   25908                                           8/*sz*/);
   25909          if (delta > delta0) goto decode_success;
   25910          /* else fall through -- decoding has failed */
   25911       }
   25912       /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
   25913       /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
   25914       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25915          Long delta0 = delta;
   25916          delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25917                                           "vcmppd", 8/*sz*/);
   25918          if (delta > delta0) goto decode_success;
   25919          /* else fall through -- decoding has failed */
   25920       }
   25921       /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
   25922       /* = VEX.NDS.128.0F.WIG C2 /r ib */
   25923       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25924          Long delta0 = delta;
   25925          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25926                                           "vcmpps", True/*all_lanes*/,
   25927                                           4/*sz*/);
   25928          if (delta > delta0) goto decode_success;
   25929          /* else fall through -- decoding has failed */
   25930       }
   25931       /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
   25932       /* = VEX.NDS.256.0F.WIG C2 /r ib */
   25933       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25934          Long delta0 = delta;
   25935          delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   25936                                           "vcmpps", 4/*sz*/);
   25937          if (delta > delta0) goto decode_success;
   25938          /* else fall through -- decoding has failed */
   25939       }
   25940       break;
   25941 
   case 0xC4:
      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
      /* Insert a 16-bit value into one of the 8 word lanes of the
         source vector (VEX.vvvv), writing the result to G and zeroing
         the upper ymm lane. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp new16 = newTemp(Ity_I16);

         if ( epartIsReg( modrm ) ) {
            /* Lane index: only the low 3 bits of imm8 are significant. */
            imm8 = (Int)(getUChar(delta+1) & 7);
            /* Register source: insert the low 16 bits of the r32. */
            assign( new16, unop(Iop_32to16,
                                getIReg32(eregOfRexRM(pfx,modrm))) );
            delta += 1+1;   /* modrm byte + imm8 */
            DIP( "vpinsrw $%d,%s,%s\n", imm8,
                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 7);
            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
            delta += alen+1;   /* amode bytes + imm8 */
            DIP( "vpinsrw $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25975 
   25976    case 0xC5:
   25977       /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
   25978       if (have66noF2noF3(pfx)
   25979          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25980          Long delta0 = delta;
   25981          delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
   25982                                               True/*isAvx*/ );
   25983          if (delta > delta0) goto decode_success;
   25984          /* else fall through -- decoding has failed */
   25985       }
   25986       break;
   25987 
   case 0xC6:
      /* Shuffle packed FP values.  imm8 selects source lanes; the four
         arms below cover PS vs PD and 128 vs 256 bit forms. */
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 */
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;   /* imm8 + amode bytes */
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 */
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;   /* imm8 + amode bytes */
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 */
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;   /* imm8 + amode bytes */
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 */
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;   /* imm8 + amode bytes */
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26110 
   26111    case 0xD0:
   26112       /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
   26113       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26114          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26115                     uses_vvvv, vbi, pfx, delta,
   26116                     "vaddsubpd", math_ADDSUBPD_128 );
   26117          goto decode_success;
   26118       }
   26119       /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
   26120       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26121          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26122                     uses_vvvv, vbi, pfx, delta,
   26123                     "vaddsubpd", math_ADDSUBPD_256 );
   26124          goto decode_success;
   26125       }
   26126       /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
   26127       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26128          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26129                     uses_vvvv, vbi, pfx, delta,
   26130                     "vaddsubps", math_ADDSUBPS_128 );
   26131          goto decode_success;
   26132       }
   26133       /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
   26134       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26135          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26136                     uses_vvvv, vbi, pfx, delta,
   26137                     "vaddsubps", math_ADDSUBPS_256 );
   26138          goto decode_success;
   26139       }
   26140       break;
   26141 
   26142    case 0xD1:
   26143       /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
   26144       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26145          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26146                                         "vpsrlw", Iop_ShrN16x8 );
   26147          *uses_vvvv = True;
   26148          goto decode_success;
   26149 
   26150       }
   26151       /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
   26152       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26153          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26154                                         "vpsrlw", Iop_ShrN16x16 );
   26155          *uses_vvvv = True;
   26156          goto decode_success;
   26157 
   26158       }
   26159       break;
   26160 
   26161    case 0xD2:
   26162       /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
   26163       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26164          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26165                                         "vpsrld", Iop_ShrN32x4 );
   26166          *uses_vvvv = True;
   26167          goto decode_success;
   26168       }
   26169       /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
   26170       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26171          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26172                                         "vpsrld", Iop_ShrN32x8 );
   26173          *uses_vvvv = True;
   26174          goto decode_success;
   26175       }
   26176       break;
   26177 
   26178    case 0xD3:
   26179       /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
   26180       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26181          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26182                                         "vpsrlq", Iop_ShrN64x2 );
   26183          *uses_vvvv = True;
   26184          goto decode_success;
   26185       }
   26186       /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
   26187       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26188          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26189                                         "vpsrlq", Iop_ShrN64x4 );
   26190          *uses_vvvv = True;
   26191          goto decode_success;
   26192       }
   26193       break;
   26194 
   26195    case 0xD4:
   26196       /* VPADDQ r/m, rV, r ::: r = rV + r/m */
   26197       /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
   26198       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26199          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26200                     uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
   26201          goto decode_success;
   26202       }
   26203       /* VPADDQ r/m, rV, r ::: r = rV + r/m */
   26204       /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
   26205       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26206          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26207                     uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
   26208          goto decode_success;
   26209       }
   26210       break;
   26211 
   26212    case 0xD5:
   26213       /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
   26214       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26215          delta = dis_AVX128_E_V_to_G(
   26216                     uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
   26217          goto decode_success;
   26218       }
   26219       /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
   26220       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26221          delta = dis_AVX256_E_V_to_G(
   26222                     uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
   26223          goto decode_success;
   26224       }
   26225       break;
   26226 
   26227    case 0xD6:
   26228       /* I can't even find any Intel docs for this one. */
   26229       /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
   26230          xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
   26231          (WIG, maybe?) */
   26232       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   26233           && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
   26234          UChar modrm = getUChar(delta);
   26235          UInt  rG    = gregOfRexRM(pfx,modrm);
   26236          if (epartIsReg(modrm)) {
   26237             /* fall through, awaiting test case */
   26238             /* dst: lo half copied, hi half zeroed */
   26239          } else {
   26240             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26241             storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
   26242             DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
   26243             delta += alen;
   26244             goto decode_success;
   26245          }
   26246       }
   26247       break;
   26248 
   26249    case 0xD7:
   26250       /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
   26251       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26252          delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
   26253          goto decode_success;
   26254       }
   26255       /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
   26256       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26257          delta = dis_PMOVMSKB_256( vbi, pfx, delta );
   26258          goto decode_success;
   26259       }
   26260       break;
   26261 
   26262    case 0xD8:
   26263       /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
   26264       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26265          delta = dis_AVX128_E_V_to_G(
   26266                     uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
   26267          goto decode_success;
   26268       }
   26269       /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
   26270       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26271          delta = dis_AVX256_E_V_to_G(
   26272                     uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
   26273          goto decode_success;
   26274       }
   26275       break;
   26276 
   26277    case 0xD9:
   26278       /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
   26279       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26280          delta = dis_AVX128_E_V_to_G(
   26281                     uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
   26282          goto decode_success;
   26283       }
   26284       /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
   26285       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26286          delta = dis_AVX256_E_V_to_G(
   26287                     uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
   26288          goto decode_success;
   26289       }
   26290       break;
   26291 
   26292    case 0xDA:
   26293       /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
   26294       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26295          delta = dis_AVX128_E_V_to_G(
   26296                     uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
   26297          goto decode_success;
   26298       }
   26299       /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
   26300       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26301          delta = dis_AVX256_E_V_to_G(
   26302                     uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
   26303          goto decode_success;
   26304       }
   26305       break;
   26306 
   26307    case 0xDB:
   26308       /* VPAND r/m, rV, r ::: r = rV & r/m */
   26309       /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
   26310       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26311          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26312                     uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
   26313          goto decode_success;
   26314       }
   26315       /* VPAND r/m, rV, r ::: r = rV & r/m */
   26316       /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
   26317       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26318          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26319                     uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
   26320          goto decode_success;
   26321       }
   26322       break;
   26323 
   26324    case 0xDC:
   26325       /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
   26326       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26327          delta = dis_AVX128_E_V_to_G(
   26328                     uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
   26329          goto decode_success;
   26330       }
   26331       /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
   26332       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26333          delta = dis_AVX256_E_V_to_G(
   26334                     uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
   26335          goto decode_success;
   26336       }
   26337       break;
   26338 
   26339    case 0xDD:
   26340       /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
   26341       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26342          delta = dis_AVX128_E_V_to_G(
   26343                     uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
   26344          goto decode_success;
   26345       }
   26346       /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
   26347       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26348          delta = dis_AVX256_E_V_to_G(
   26349                     uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
   26350          goto decode_success;
   26351       }
   26352       break;
   26353 
   26354    case 0xDE:
   26355       /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
   26356       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26357          delta = dis_AVX128_E_V_to_G(
   26358                     uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
   26359          goto decode_success;
   26360       }
   26361       /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
   26362       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26363          delta = dis_AVX256_E_V_to_G(
   26364                     uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
   26365          goto decode_success;
   26366       }
   26367       break;
   26368 
   26369    case 0xDF:
   26370       /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
   26371       /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
   26372       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26373          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   26374                     uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
   26375                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   26376          goto decode_success;
   26377       }
   26378       /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
   26379       /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
   26380       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26381          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   26382                     uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
   26383                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   26384          goto decode_success;
   26385       }
   26386       break;
   26387 
   26388    case 0xE0:
   26389       /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
   26390       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26391          delta = dis_AVX128_E_V_to_G(
   26392                     uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
   26393          goto decode_success;
   26394       }
   26395       /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
   26396       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26397          delta = dis_AVX256_E_V_to_G(
   26398                     uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
   26399          goto decode_success;
   26400       }
   26401       break;
   26402 
   26403    case 0xE1:
   26404       /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
   26405       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26406          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26407                                         "vpsraw", Iop_SarN16x8 );
   26408          *uses_vvvv = True;
   26409          goto decode_success;
   26410       }
   26411       /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
   26412       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26413          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26414                                         "vpsraw", Iop_SarN16x16 );
   26415          *uses_vvvv = True;
   26416          goto decode_success;
   26417       }
   26418       break;
   26419 
   26420    case 0xE2:
   26421       /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
   26422       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26423          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26424                                         "vpsrad", Iop_SarN32x4 );
   26425          *uses_vvvv = True;
   26426          goto decode_success;
   26427       }
   26428       /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
   26429       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26430          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26431                                         "vpsrad", Iop_SarN32x8 );
   26432          *uses_vvvv = True;
   26433          goto decode_success;
   26434       }
   26435       break;
   26436 
   26437    case 0xE3:
   26438       /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
   26439       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26440          delta = dis_AVX128_E_V_to_G(
   26441                     uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
   26442          goto decode_success;
   26443       }
   26444       /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
   26445       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26446          delta = dis_AVX256_E_V_to_G(
   26447                     uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
   26448          goto decode_success;
   26449       }
   26450       break;
   26451 
   26452    case 0xE4:
   26453       /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
   26454       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26455          delta = dis_AVX128_E_V_to_G(
   26456                     uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
   26457          goto decode_success;
   26458       }
   26459       /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
   26460       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26461          delta = dis_AVX256_E_V_to_G(
   26462                     uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
   26463          goto decode_success;
   26464       }
   26465       break;
   26466 
   26467    case 0xE5:
   26468       /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
   26469       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26470          delta = dis_AVX128_E_V_to_G(
   26471                     uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
   26472          goto decode_success;
   26473       }
   26474       /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
   26475       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26476          delta = dis_AVX256_E_V_to_G(
   26477                     uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
   26478          goto decode_success;
   26479       }
   26480       break;
   26481 
   26482    case 0xE6:
   26483       /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
   26484       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   26485          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
   26486          goto decode_success;
   26487       }
   26488       /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
   26489       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   26490          delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
   26491          goto decode_success;
   26492       }
   26493       /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
   26494       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26495          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   26496                                    True/*r2zero*/);
   26497          goto decode_success;
   26498       }
   26499       /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
   26500       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26501          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
   26502          goto decode_success;
   26503       }
   26504       /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
   26505       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26506          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   26507                                    False/*!r2zero*/);
   26508          goto decode_success;
   26509       }
   26510       /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
   26511       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26512          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
   26513          goto decode_success;
   26514       }
   26515       break;
   26516 
   26517    case 0xE7:
   26518       /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
   26519       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26520          UChar modrm = getUChar(delta);
   26521          UInt rG     = gregOfRexRM(pfx,modrm);
   26522          if (!epartIsReg(modrm)) {
   26523             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26524             gen_SEGV_if_not_16_aligned( addr );
   26525             storeLE( mkexpr(addr), getXMMReg(rG) );
   26526             DIP("vmovntdq %s,%s\n", nameXMMReg(rG), dis_buf);
   26527             delta += alen;
   26528             goto decode_success;
   26529          }
   26530          /* else fall through */
   26531       }
   26532       /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
   26533       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26534          UChar modrm = getUChar(delta);
   26535          UInt rG     = gregOfRexRM(pfx,modrm);
   26536          if (!epartIsReg(modrm)) {
   26537             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26538             gen_SEGV_if_not_32_aligned( addr );
   26539             storeLE( mkexpr(addr), getYMMReg(rG) );
   26540             DIP("vmovntdq %s,%s\n", nameYMMReg(rG), dis_buf);
   26541             delta += alen;
   26542             goto decode_success;
   26543          }
   26544          /* else fall through */
   26545       }
   26546       break;
   26547 
   26548    case 0xE8:
   26549       /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
   26550       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26551          delta = dis_AVX128_E_V_to_G(
   26552                     uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
   26553          goto decode_success;
   26554       }
   26555       /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
   26556       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26557          delta = dis_AVX256_E_V_to_G(
   26558                     uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
   26559          goto decode_success;
   26560       }
   26561       break;
   26562 
   26563    case 0xE9:
   26564       /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
   26565       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26566          delta = dis_AVX128_E_V_to_G(
   26567                     uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
   26568          goto decode_success;
   26569       }
   26570       /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
   26571       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26572          delta = dis_AVX256_E_V_to_G(
   26573                     uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
   26574          goto decode_success;
   26575       }
   26576       break;
   26577 
   26578    case 0xEA:
   26579       /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
   26580       /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
   26581       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26582          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26583                     uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
   26584          goto decode_success;
   26585       }
   26586       /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
   26587       /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
   26588       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26589          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26590                     uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
   26591          goto decode_success;
   26592       }
   26593       break;
   26594 
   26595    case 0xEB:
   26596       /* VPOR r/m, rV, r ::: r = rV | r/m */
   26597       /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
   26598       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26599          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26600                     uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
   26601          goto decode_success;
   26602       }
   26603       /* VPOR r/m, rV, r ::: r = rV | r/m */
   26604       /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
   26605       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26606          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26607                     uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
   26608          goto decode_success;
   26609       }
   26610       break;
   26611 
   26612    case 0xEC:
   26613       /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
   26614       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26615          delta = dis_AVX128_E_V_to_G(
   26616                     uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
   26617          goto decode_success;
   26618       }
   26619       /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
   26620       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26621          delta = dis_AVX256_E_V_to_G(
   26622                     uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
   26623          goto decode_success;
   26624       }
   26625       break;
   26626 
   26627    case 0xED:
   26628       /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
   26629       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26630          delta = dis_AVX128_E_V_to_G(
   26631                     uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
   26632          goto decode_success;
   26633       }
   26634       /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
   26635       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26636          delta = dis_AVX256_E_V_to_G(
   26637                     uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
   26638          goto decode_success;
   26639       }
   26640       break;
   26641 
   26642    case 0xEE:
   26643       /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
   26644       /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
   26645       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26646          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26647                     uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
   26648          goto decode_success;
   26649       }
   26650       /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
   26651       /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
   26652       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26653          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26654                     uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
   26655          goto decode_success;
   26656       }
   26657       break;
   26658 
   26659    case 0xEF:
   26660       /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
   26661       /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
   26662       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26663          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26664                     uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
   26665          goto decode_success;
   26666       }
   26667       /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
   26668       /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
   26669       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26670          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26671                     uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
   26672          goto decode_success;
   26673       }
   26674       break;
   26675 
   26676    case 0xF0:
   26677       /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
   26678       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26679          UChar  modrm = getUChar(delta);
   26680          UInt   rD    = gregOfRexRM(pfx, modrm);
   26681          IRTemp tD    = newTemp(Ity_V256);
   26682          if (epartIsReg(modrm)) break;
   26683          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   26684          delta += alen;
   26685          assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   26686          DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
   26687          putYMMReg(rD, mkexpr(tD));
   26688          goto decode_success;
   26689       }
   26690       /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
   26691       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26692          UChar  modrm = getUChar(delta);
   26693          UInt   rD    = gregOfRexRM(pfx, modrm);
   26694          IRTemp tD    = newTemp(Ity_V128);
   26695          if (epartIsReg(modrm)) break;
   26696          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   26697          delta += alen;
   26698          assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   26699          DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
   26700          putYMMRegLoAndZU(rD, mkexpr(tD));
   26701          goto decode_success;
   26702       }
   26703       break;
   26704 
   26705    case 0xF1:
   26706       /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
   26707       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26708          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26709                                         "vpsllw", Iop_ShlN16x8 );
   26710          *uses_vvvv = True;
   26711          goto decode_success;
   26712 
   26713       }
   26714       /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
   26715       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26716          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26717                                         "vpsllw", Iop_ShlN16x16 );
   26718          *uses_vvvv = True;
   26719          goto decode_success;
   26720 
   26721       }
   26722       break;
   26723 
   26724    case 0xF2:
   26725       /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
   26726       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26727          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26728                                         "vpslld", Iop_ShlN32x4 );
   26729          *uses_vvvv = True;
   26730          goto decode_success;
   26731       }
   26732       /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
   26733       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26734          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26735                                         "vpslld", Iop_ShlN32x8 );
   26736          *uses_vvvv = True;
   26737          goto decode_success;
   26738       }
   26739       break;
   26740 
   26741    case 0xF3:
   26742       /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
   26743       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26744          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26745                                         "vpsllq", Iop_ShlN64x2 );
   26746          *uses_vvvv = True;
   26747          goto decode_success;
   26748       }
   26749       /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
   26750       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26751          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26752                                         "vpsllq", Iop_ShlN64x4 );
   26753          *uses_vvvv = True;
   26754          goto decode_success;
   26755       }
   26756       break;
   26757 
   26758    case 0xF4:
   26759       /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
   26760       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26761          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26762                     uses_vvvv, vbi, pfx, delta,
   26763                     "vpmuludq", math_PMULUDQ_128 );
   26764          goto decode_success;
   26765       }
   26766       /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
   26767       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26768          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26769                     uses_vvvv, vbi, pfx, delta,
   26770                     "vpmuludq", math_PMULUDQ_256 );
   26771          goto decode_success;
   26772       }
   26773       break;
   26774 
   26775    case 0xF5:
   26776       /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
   26777       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26778          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26779                     uses_vvvv, vbi, pfx, delta,
   26780                     "vpmaddwd", math_PMADDWD_128 );
   26781          goto decode_success;
   26782       }
   26783       /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
   26784       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26785          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26786                     uses_vvvv, vbi, pfx, delta,
   26787                     "vpmaddwd", math_PMADDWD_256 );
   26788          goto decode_success;
   26789       }
   26790       break;
   26791 
   26792    case 0xF6:
   26793       /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
   26794       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26795          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26796                     uses_vvvv, vbi, pfx, delta,
   26797                     "vpsadbw", math_PSADBW_128 );
   26798          goto decode_success;
   26799       }
   26800       /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
   26801       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26802          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26803                     uses_vvvv, vbi, pfx, delta,
   26804                     "vpsadbw", math_PSADBW_256 );
   26805          goto decode_success;
   26806       }
   26807       break;
   26808 
   26809    case 0xF7:
   26810       /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
   26811       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   26812           && epartIsReg(getUChar(delta))) {
   26813          delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
   26814          goto decode_success;
   26815       }
   26816       break;
   26817 
   26818    case 0xF8:
   26819       /* VPSUBB r/m, rV, r ::: r = rV - r/m */
   26820       /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
   26821       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26822          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26823                     uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
   26824          goto decode_success;
   26825       }
   26826       /* VPSUBB r/m, rV, r ::: r = rV - r/m */
   26827       /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
   26828       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26829          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26830                     uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
   26831          goto decode_success;
   26832       }
   26833       break;
   26834 
   26835    case 0xF9:
   26836       /* VPSUBW r/m, rV, r ::: r = rV - r/m */
   26837       /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
   26838       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26839          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26840                     uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
   26841          goto decode_success;
   26842       }
   26843       /* VPSUBW r/m, rV, r ::: r = rV - r/m */
   26844       /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
   26845       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26846          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26847                     uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
   26848          goto decode_success;
   26849       }
   26850       break;
   26851 
   26852    case 0xFA:
   26853       /* VPSUBD r/m, rV, r ::: r = rV - r/m */
   26854       /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
   26855       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26856          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26857                     uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
   26858          goto decode_success;
   26859       }
   26860       /* VPSUBD r/m, rV, r ::: r = rV - r/m */
   26861       /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
   26862       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26863          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26864                     uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
   26865          goto decode_success;
   26866       }
   26867       break;
   26868 
   26869    case 0xFB:
   26870       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   26871       /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
   26872       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26873          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26874                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
   26875          goto decode_success;
   26876       }
   26877       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   26878       /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
   26879       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26880          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26881                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
   26882          goto decode_success;
   26883       }
   26884       break;
   26885 
   26886    case 0xFC:
   26887       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   26888       /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
   26889       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26890          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26891                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
   26892          goto decode_success;
   26893       }
   26894       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   26895       /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
   26896       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26897          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26898                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
   26899          goto decode_success;
   26900       }
   26901       break;
   26902 
   26903    case 0xFD:
   26904       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   26905       /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
   26906       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26907          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26908                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
   26909          goto decode_success;
   26910       }
   26911       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   26912       /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
   26913       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26914          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26915                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
   26916          goto decode_success;
   26917       }
   26918       break;
   26919 
   26920    case 0xFE:
   26921       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   26922       /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
   26923       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26924          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26925                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
   26926          goto decode_success;
   26927       }
   26928       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   26929       /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
   26930       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26931          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26932                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
   26933          goto decode_success;
   26934       }
   26935       break;
   26936 
   26937    default:
   26938       break;
   26939 
   26940    }
   26941 
   26942   //decode_failure:
   26943    return deltaIN;
   26944 
   26945   decode_success:
   26946    return delta;
   26947 }
   26948 
   26949 
   26950 /*------------------------------------------------------------*/
   26951 /*---                                                      ---*/
   26952 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
   26953 /*---                                                      ---*/
   26954 /*------------------------------------------------------------*/
   26955 
   26956 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   26957 {
   26958    /* In the control vector, zero out all but the bottom two bits of
   26959       each 32-bit lane. */
   26960    IRExpr* cv1 = binop(Iop_ShrN32x4,
   26961                        binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
   26962                        mkU8(30));
   26963    /* And use the resulting cleaned-up control vector as steering
   26964       in a Perm operation. */
   26965    IRTemp res = newTemp(Ity_V128);
   26966    assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
   26967    return res;
   26968 }
   26969 
   26970 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   26971 {
   26972    IRTemp dHi, dLo, cHi, cLo;
   26973    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   26974    breakupV256toV128s( dataV, &dHi, &dLo );
   26975    breakupV256toV128s( ctrlV, &cHi, &cLo );
   26976    IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   26977    IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   26978    IRTemp res = newTemp(Ity_V256);
   26979    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   26980    return res;
   26981 }
   26982 
   26983 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   26984 {
   26985    /* No cleverness here .. */
   26986    IRTemp dHi, dLo, cHi, cLo;
   26987    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   26988    breakupV128to64s( dataV, &dHi, &dLo );
   26989    breakupV128to64s( ctrlV, &cHi, &cLo );
   26990    IRExpr* rHi
   26991       = IRExpr_ITE( unop(Iop_64to1,
   26992                          binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
   26993                     mkexpr(dHi), mkexpr(dLo) );
   26994    IRExpr* rLo
   26995       = IRExpr_ITE( unop(Iop_64to1,
   26996                          binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
   26997                     mkexpr(dHi), mkexpr(dLo) );
   26998    IRTemp res = newTemp(Ity_V128);
   26999    assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   27000    return res;
   27001 }
   27002 
   27003 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   27004 {
   27005    IRTemp dHi, dLo, cHi, cLo;
   27006    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27007    breakupV256toV128s( dataV, &dHi, &dLo );
   27008    breakupV256toV128s( ctrlV, &cHi, &cLo );
   27009    IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   27010    IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   27011    IRTemp res = newTemp(Ity_V256);
   27012    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   27013    return res;
   27014 }
   27015 
   27016 static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
   27017 {
   27018    /* In the control vector, zero out all but the bottom three bits of
   27019       each 32-bit lane. */
   27020    IRExpr* cv1 = binop(Iop_ShrN32x8,
   27021                        binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
   27022                        mkU8(29));
   27023    /* And use the resulting cleaned-up control vector as steering
   27024       in a Perm operation. */
   27025    IRTemp res = newTemp(Ity_V256);
   27026    assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
   27027    return res;
   27028 }
   27029 
/* BMI2 SARX/SHLX/SHRX family: G = E shifted by (V & (8*size-1)),
   where op8 is the 8-bit base shift op (widened via mkSizedOp) and
   size is 8 or 4 bytes depending on REX.W.  The shift amount comes
   from the vvvv register; rflags are left unmodified.  Returns the
   updated delta. */
static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
                         const VexAbiInfo* vbi, Prefix pfx, Long delta,
                         const HChar* opname, IROp op8 )
{
   HChar   dis_buf[50];
   Int     alen;
   Int     size = getRexW(pfx) ? 8 : 4;
   IRType  ty   = szToITy(size);
   IRTemp  src  = newTemp(ty);
   IRTemp  amt  = newTemp(ty);
   UChar   rm   = getUChar(delta);

   /* Shift amount always comes from the vvvv register. */
   assign( amt, getIRegV(size,pfx) );
   if (epartIsReg(rm)) {
      assign( src, getIRegE(size,pfx,rm) );
      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
                           nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
      delta++;
   } else {
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( src, loadLE(ty, mkexpr(addr)) );
      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
                           nameIRegG(size,pfx,rm));
      delta += alen;
   }

   /* Mask the amount to the operand width (31 or 63) before
      shifting, matching hardware behavior. */
   putIRegG( size, pfx, rm,
             binop(mkSizedOp(ty,op8), mkexpr(src),
                   narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
                                          mkU(ty,8*size-1)))) );
   /* Flags aren't modified.  */
   *uses_vvvv = True;
   return delta;
}
   27064 
   27065 
/* FMA3 family: VF{,N}M{ADD,SUB}{132,213,231}{SS,SD,PS,PD} and
   VFM{ADDSUB,SUBADD}*.  The low nibble of opc selects the variant
   (and, for odd values > 7, the scalar form); the high nibble
   (0x90/0xA0/0xB0) selects the 132/213/231 operand ordering.  Each
   lane is computed with Iop_MAddF64/Iop_MAddF32 under
   get_FAKE_roundingmode(), with operands negated as needed to express
   the sub/nmadd/nmsub variants.  Returns the updated delta. */
static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
{
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx, modrm);
   UInt   rV      = getVexNvvvv(pfx);
   /* Scalar forms are the odd opcodes whose low nibble is > 7
      (…99/…9B/…9D/…9F etc). */
   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;
   IRType vty     = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128;
   IRTemp vX      = newTemp(vty);
   IRTemp vY      = newTemp(vty);
   IRTemp vZ      = newTemp(vty);
   /* Per-lane expression arrays; at most 8 F32 lanes in a YMM. */
   IRExpr *x[8], *y[8], *z[8];
   IRTemp addr    = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen    = 0;
   const HChar *name;
   const HChar *suffix;
   const HChar *order;
   Bool   negateRes   = False;
   Bool   negateZeven = False;
   Bool   negateZodd  = False;
   Int    i, j;
   Int    count;
   /* 64-bit lane extractors: indices 0..3 pull lanes out of a V256,
      indices 4..5 out of a V128 (selected via j below). */
   static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1,
                         Iop_V256to64_2, Iop_V256to64_3,
                         Iop_V128to64, Iop_V128HIto64 };

   /* Decode the variant: which lanes of the addend get negated, and
      whether the whole result is negated ("n" prefix). */
   switch (opc & 0xF) {
   case 0x6:
      name = "addsub";
      negateZeven = True;
      break;
   case 0x7:
      name = "subadd";
      negateZodd = True;
      break;
   case 0x8:
   case 0x9:
      name = "add";
      break;
   case 0xA:
   case 0xB:
      name = "sub";
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xC:
   case 0xD:
      name = "add";
      negateRes = True;
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xE:
   case 0xF:
      name = "sub";
      negateRes = True;
      break;
   default:
      vpanic("dis_FMA(amd64)");
      break;
   }
   switch (opc & 0xF0) {
   case 0x90: order = "132"; break;
   case 0xA0: order = "213"; break;
   case 0xB0: order = "231"; break;
   default: vpanic("dis_FMA(amd64)"); break;
   }
   if (scalar)
      suffix = ty == Ity_F64 ? "sd" : "ss";
   else
      suffix = ty == Ity_F64 ? "pd" : "ps";

   /* Fetch the two register operands (G and vvvv). */
   if (scalar) {
      assign( vX, ty == Ity_F64
                  ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) );
      assign( vZ, ty == Ity_F64
                  ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) );
   } else {
      assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) );
      assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) );
   }

   /* Fetch the third operand (register or memory) and print the
      disassembly. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx, modrm);
      delta += 1;
      if (scalar)
         assign( vY, ty == Ity_F64
                     ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
      else
         assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) );
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
             nameXMMReg(rG));
      }
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(vY, loadLE(vty, mkexpr(addr)));
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG));
      }
   }

   /* vX/vY/vZ now in 132 order.  If it is different order, swap the
      arguments.  */
   if ((opc & 0xF0) != 0x90) {
      IRTemp tem = vX;
      if ((opc & 0xF0) == 0xA0) {
         vX = vZ;
         vZ = vY;
         vY = tem;
      } else {
         vX = vZ;
         vZ = tem;
      }
   }

   /* Break the vectors into per-lane scalar expressions x[], y[],
      z[]. */
   if (scalar) {
      count = 1;
      x[0] = mkexpr(vX);
      y[0] = mkexpr(vY);
      z[0] = mkexpr(vZ);
   } else if (ty == Ity_F32) {
      /* F32 lanes: extract 64-bit chunks, then split each into its
         two 32-bit halves. */
      count = vty == Ity_V256 ? 8 : 4;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i += 2) {
         IRTemp tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vX)));
         x[i] = unop(Iop_64to32, mkexpr(tem));
         x[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vY)));
         y[i] = unop(Iop_64to32, mkexpr(tem));
         y[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vZ)));
         z[i] = unop(Iop_64to32, mkexpr(tem));
         z[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
      }
   } else {
      /* F64 lanes: one 64-bit extraction per lane. */
      count = vty == Ity_V256 ? 4 : 2;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i++) {
         x[i] = unop(ops[i + j], mkexpr(vX));
         y[i] = unop(ops[i + j], mkexpr(vY));
         z[i] = unop(ops[i + j], mkexpr(vZ));
      }
   }
   /* Reinterpret the integer lanes as floats (scalar lanes were
      fetched as floats already). */
   if (!scalar)
      for (i = 0; i < count; i++) {
         IROp op = ty == Ity_F64
                   ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32;
         x[i] = unop(op, x[i]);
         y[i] = unop(op, y[i]);
         z[i] = unop(op, z[i]);
      }
   /* Per lane: optionally negate the addend, do the fused
      multiply-add, optionally negate the result, write it back. */
   for (i = 0; i < count; i++) {
      if ((i & 1) ? negateZodd : negateZeven)
         z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]);
      x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
                        get_FAKE_roundingmode(), x[i], y[i], z[i]);
      if (negateRes)
         x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]);
      if (ty == Ity_F64)
         putYMMRegLane64F( rG, i, x[i] );
      else
         putYMMRegLane32F( rG, i, x[i] );
   }
   /* 128-bit (and scalar) forms zero the upper half of the YMM
      destination. */
   if (vty != Ity_V256)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   27251 
   27252 
/* Masked load or masked store: VMASKMOVPS/VPMASKMOVD (ty == Ity_I32)
   and VMASKMOVPD/VPMASKMOVQ (ty == Ity_I64), 128- or 256-bit forms
   (isYMM).  The MSB of each 32- or 64-bit lane of the mask register
   (vvvv, i.e. rV) enables the corresponding lane's memory access:
   disabled load lanes receive 0 via IRStmt_LoadG's alternative value,
   disabled store lanes are suppressed via IRStmt_StoreG.  The operand
   must be a memory operand (caller has checked !epartIsReg).  */
static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
                            Prefix pfx, Long delta,
                            const HChar* opname, Bool isYMM, IRType ty,
                            Bool isLoad )
{
   HChar   dis_buf[50];
   Int     alen, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /* Disassembly printing: four cases, load/store x XMM/YMM. */
   /**/ if (isLoad && isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   }
   else if (isLoad && !isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   }

   else if (!isLoad && isYMM) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rG), nameYMMReg(rV), dis_buf );
   }
   else {
      vassert(!isLoad && !isYMM);
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rG), nameXMMReg(rV), dis_buf );
   }

   vassert(ty == Ity_I32 || ty == Ity_I64);
   Bool laneIs32 = ty == Ity_I32;

   /* 2, 4 or 8 lanes depending on width and lane size. */
   Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);

   for (i = 0; i < nLanes; i++) {
      IRExpr* shAmt = laneIs32 ? mkU8(31)    : mkU8(63);
      IRExpr* one   = laneIs32 ? mkU32(1)    : mkU64(1);
      IROp    opSHR = laneIs32 ? Iop_Shr32   : Iop_Shr64;
      IROp    opEQ  = laneIs32 ? Iop_CmpEQ32 : Iop_CmpEQ64;
      IRExpr* lane  = (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i );

      /* Lane is enabled iff the sign bit of the corresponding mask
         lane is set: (lane >> (31|63)) == 1. */
      IRTemp  cond = newTemp(Ity_I1);
      assign(cond, binop(opEQ, binop(opSHR, lane, shAmt), one));

      IRTemp  data = newTemp(ty);
      IRExpr* ea   = binop(Iop_Add64, mkexpr(addr),
                                      mkU64(i * (laneIs32 ? 4 : 8)));
      if (isLoad) {
         /* Guarded load: disabled lanes get the alternative value 0. */
         stmt(
            IRStmt_LoadG(
               Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
               data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
         ));
         (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
      } else {
         /* Guarded store: disabled lanes write nothing. */
         assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
         stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
      }
   }

   /* 128-bit loads zero the upper half of the YMM destination. */
   if (isLoad && !isYMM)
      putYMMRegLane128( rG, 1, mkV128(0) );

   *uses_vvvv = True;
   return delta;
}
   27321 
   27322 
/* Gather: VGATHERDPS/QPS/DPD/QPD and VPGATHERDD/QD/DQ/QQ.  ty is the
   element type (Ity_I32/Ity_I64), isVM64x selects 64-bit indices, and
   isYMM the 256-bit form.  rV is the mask register; its lanes are set
   to copies of their sign bit before the gather, and each lane is
   cleared as its element is fetched.  If the destination, index and
   mask registers are not all distinct (which the ISA makes #UD), the
   decode fails and deltaIN is returned unchanged via the caller.  */
static ULong dis_VGATHER ( Bool *uses_vvvv, const VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, Bool isYMM,
                           Bool isVM64x, IRType ty )
{
   HChar  dis_buf[50];
   Int    alen, i, vscale, count1, count2;
   IRTemp addr;
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   UInt   rV    = getVexNvvvv(pfx);
   UInt   rI;
   /* Destination/index widths depend on the element size vs index
      size combination (e.g. VGATHERQPS uses a 256-bit index but only
      a 128-bit destination). */
   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
   IRTemp cond;
   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
                         idxTy, &vscale );
   /* Fail the decode on a bad VSIB encoding or non-distinct
      registers. */
   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
      return delta;
   if (dstTy == Ity_V256) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
   }
   delta += alen;

   /* count1 = number of mask lanes to sign-propagate; count2 = number
      of elements actually gathered (fewer for the 64-bit-index,
      32-bit-element forms). */
   if (ty == Ity_I32) {
      count1 = isYMM ? 8 : 4;
      count2 = isVM64x ? count1 / 2 : count1;
   } else {
      count1 = count2 = isYMM ? 4 : 2;
   }

   /* First update the mask register to copies of the sign bit.  */
   if (ty == Ity_I32) {
      if (isYMM)
         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
      else
         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
   } else {
      for (i = 0; i < count1; i++) {
         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
                                       mkU8(63)) );
      }
   }

   /* Next gather the individual elements.  If any fault occurs, the
      corresponding mask element will be set and the loop stops.  */
   for (i = 0; i < count2; i++) {
      IRExpr *expr, *addr_expr;
      /* Lane is enabled iff its mask lane is negative (sign bit set). */
      cond = newTemp(Ity_I1);
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0)) );
      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
                           : getYMMRegLane64( rG, i );
      /* Element address = base + sign-extended(index lane) * vscale. */
      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
      switch (vscale) {
         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
         default: break;
      }
      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
      /* For disabled lanes, load from RSP instead so the unconditional
         loadLE below still has a valid address; the loaded value is
         then discarded by the ITE that keeps the old lane value. */
      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
      if (ty == Ity_I32) {
         putYMMRegLane32( rG, i, expr );
         putYMMRegLane32( rV, i, mkU32(0) );
      } else {
         putYMMRegLane64( rG, i, expr);
         putYMMRegLane64( rV, i, mkU64(0) );
      }
   }

   /* Zero any destination/mask bits above the gathered elements
      (upper YMM halves of 128-bit forms, upper qwords of the
      64-bit-index 32-bit-element forms). */
   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
      if (ty == Ity_I64 || isYMM)
         putYMMRegLane128( rV, 1, mkV128(0) );
      else if (ty == Ity_I32 && count2 == 2) {
         putYMMRegLane64( rV, 1, mkU64(0) );
         putYMMRegLane64( rG, 1, mkU64(0) );
      }
      putYMMRegLane128( rG, 1, mkV128(0) );
   }

   *uses_vvvv = True;
   return delta;
}
   27416 
   27417 
   27418 __attribute__((noinline))
   27419 static
   27420 Long dis_ESC_0F38__VEX (
   27421         /*MB_OUT*/DisResult* dres,
   27422         /*OUT*/   Bool*      uses_vvvv,
   27423         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   27424         Bool         resteerCisOk,
   27425         void*        callback_opaque,
   27426         const VexArchInfo* archinfo,
   27427         const VexAbiInfo*  vbi,
   27428         Prefix pfx, Int sz, Long deltaIN
   27429      )
   27430 {
   27431    IRTemp addr  = IRTemp_INVALID;
   27432    Int    alen  = 0;
   27433    HChar  dis_buf[50];
   27434    Long   delta = deltaIN;
   27435    UChar  opc   = getUChar(delta);
   27436    delta++;
   27437    *uses_vvvv = False;
   27438 
   27439    switch (opc) {
   27440 
   27441    case 0x00:
   27442       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27443       /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
   27444       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27445          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27446                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
   27447          goto decode_success;
   27448       }
   27449       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27450       /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
   27451       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27452          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27453                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
   27454          goto decode_success;
   27455       }
   27456       break;
   27457 
   27458    case 0x01:
   27459    case 0x02:
   27460    case 0x03:
   27461       /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
   27462       /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
   27463       /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
   27464       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27465          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   27466          *uses_vvvv = True;
   27467          goto decode_success;
   27468       }
   27469       /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
   27470       /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
   27471       /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
   27472       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27473          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   27474          *uses_vvvv = True;
   27475          goto decode_success;
   27476       }
   27477       break;
   27478 
   27479    case 0x04:
   27480       /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
   27481       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27482          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27483                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27484                     math_PMADDUBSW_128 );
   27485          goto decode_success;
   27486       }
   27487       /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
   27488       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27489          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27490                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27491                     math_PMADDUBSW_256 );
   27492          goto decode_success;
   27493       }
   27494       break;
   27495 
   case 0x05:
   case 0x06:
   case 0x07:
      /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
      /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
      /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
      /* Horizontal subtracts.  Deliberately routed through the PHADD
         helpers: dis_PHADD_{128,256} dispatch on opc and handle the
         subtract variants (05..07) as well as the adds (01..03). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
      /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
      /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PHADD_256( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27516 
   case 0x08:
   case 0x09:
   case 0x0A:
      /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
      /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
      /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
      /* Per-lane sign application: each lane of dV is negated, zeroed or
         passed through according to the sign of the matching lane of sV.
         The IR is built inline, applying dis_PSIGN_helper to each 64-bit
         half of the 128-bit vectors. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         HChar  ch      = '?';     /* lane-size suffix for disassembly text */
         Int    laneszB = 0;       /* lane size in bytes, set from opc */
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         /* opc selects the lane width: 08=bytes, 09=words, 0A=dwords. */
         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getXMMReg(rV) );

         /* Source operand: XMM register or 128-bit memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
         }

         breakupV128to64s( dV, &dHi, &dLo );
         breakupV128to64s( sV, &sHi, &sLo );

         /* Reassemble the result and zero the upper YMM lanes, as
            required for VEX.128-encoded operations. */
         putYMMRegLoAndZU(
            rG,
            binop(Iop_64HLtoV128,
                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
      /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
      /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
      /* 256-bit form: identical scheme, applied to four 64-bit pieces. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  ch      = '?';     /* lane-size suffix for disassembly text */
         Int    laneszB = 0;       /* lane size in bytes, set from opc */
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         /* opc selects the lane width: 08=bytes, 09=words, 0A=dwords. */
         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getYMMReg(rV) );

         /* Source operand: YMM register or 256-bit memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         /* Apply the helper to each 64-bit quarter and reassemble. */
         putYMMReg(
            rG,
            binop( Iop_V128HLtoV256,
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
                         dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
                   ),
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
                         dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
                   )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27628 
   case 0x0B:
      /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
      /* Multiply signed 16-bit lanes, round and take the high part; the
         per-64-bit-chunk arithmetic lives in dis_PMULHRSW_helper. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         /* Source operand: XMM register or 128-bit memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
         }

         breakupV128to64s( dV, &dHi, &dLo );
         breakupV128to64s( sV, &sHi, &sLo );

         /* Recombine the two halves; zero the upper YMM lanes per the
            VEX.128 convention. */
         putYMMRegLoAndZU(
            rG,
            binop(Iop_64HLtoV128,
                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
      /* 256-bit form: same scheme across four 64-bit pieces. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dV, getYMMReg(rV) );

         /* Source operand: YMM register or 256-bit memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop(Iop_V128HLtoV256,
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
                        dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
                        dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27713 
   case 0x0C:
      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
      /* Variable in-lane permute of 32-bit float lanes of rV, controlled
         by the low bits of each lane of the E operand (ctrlV).  Note the
         operand roles: rV supplies the data, E supplies the control. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27770 
   case 0x0D:
      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
      /* Variable in-lane permute of 64-bit float lanes; same operand
         roles as VPERMILPS above: rV = data, E operand = control. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27827 
   case 0x0E:
      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
      /* Sign-bit test of packed floats; dis_xTESTy_* take the sign bit
         width as the last argument (32 for PS, 64 for PD; 0 means full
         bit test, used by VPTEST below). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
         goto decode_success;
      }
      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
         goto decode_success;
      }
      break;

   case 0x0F:
      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
         goto decode_success;
      }
      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
         goto decode_success;
      }
      break;
   27853 
   case 0x16:
      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
      /* Reuses math_VPERMD: VPERMPS and VPERMD both permute 32-bit lanes
         under the same control encoding, so the IR is identical. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
      /* Full-width bit test: last argument 0 selects whole-lane testing
         in dis_xTESTy_* (cf. 32/64 for VTESTPS/VTESTPD above). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
         goto decode_success;
      }
      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
         goto decode_success;
      }
      break;
   27876 
   case 0x18:
      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      /* Four forms: {m32, xmm2-lane0} source x {xmm, ymm} destination.
         The 32-bit scalar is doubled into 64 bits with Iop_32HLto64 and
         then replicated across the destination.  The register-source
         forms (AVX2) advance delta by 1 after the modrm byte; the memory
         forms advance by the decoded amode length. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         delta++;
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;
   27949 
   case 0x19:
      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      /* Replicate a 64-bit scalar (from memory, or from lane 0 of an XMM
         register in the AVX2 form) into all four quadwords of a YMM. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, getXMMRegLane64(rE, 0));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;
   27984 
   case 0x1A:
      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
      /* Memory-source only (register form is not a valid encoding):
         load 128 bits and duplicate into both halves of the YMM. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;
   28001 
   case 0x1C:
      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
      /* Packed absolute value; the math_PABS_*_pap<N> helpers encode the
         lane size in bytes (1/2/4) in their suffix. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_XMM_pap1 );
         goto decode_success;
      }
      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_YMM_pap1 );
         goto decode_success;
      }
      break;

   case 0x1D:
      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_XMM_pap2 );
         goto decode_success;
      }
      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_YMM_pap2 );
         goto decode_success;
      }
      break;

   case 0x1E:
      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_XMM_pap4 );
         goto decode_success;
      }
      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_YMM_pap4 );
         goto decode_success;
      }
      break;
   28052 
   case 0x20:
      /* VPMOVSXBW xmm2/m64, xmm1 */
      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
      /* Sign-extending packed moves (opc 20..25).  The shared
         dis_PMOVxX* helpers also implement the zero-extending VPMOVZX
         family; xIsZ=False selects sign extension here. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBW xmm2/m128, ymm1 */
      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* VPMOVSXBD xmm2/m32, xmm1 */
      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBD xmm2/m64, ymm1 */
      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* VPMOVSXBQ xmm2/m16, xmm1 */
      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
      /* byte->quad widening has sign-extend-only helpers (no xIsZ
         parameter), unlike the other widths in this family. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXBQ xmm2/m32, ymm1 */
      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x23:
      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x24:
      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x25:
      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;
   28140 
   case 0x28:
      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
      /* Signed multiply of the even 32-bit lanes producing 64-bit
         results; IR built by math_PMULDQ_{128,256}. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_128 );
         goto decode_success;
      }
      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_256 );
         goto decode_success;
      }
      break;

   case 0x29:
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
      /* Simple lane-wise compare: maps directly onto a single IR binop. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
         goto decode_success;
      }
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
         goto decode_success;
      }
      break;
   28174 
   28175    case 0x2A:
   28176       /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
   28177       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28178           && !epartIsReg(getUChar(delta))) {
   28179          UChar  modrm = getUChar(delta);
   28180          UInt   rD    = gregOfRexRM(pfx, modrm);
   28181          IRTemp tD    = newTemp(Ity_V128);
   28182          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28183          delta += alen;
   28184          gen_SEGV_if_not_16_aligned(addr);
   28185          assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   28186          DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
   28187          putYMMRegLoAndZU(rD, mkexpr(tD));
   28188          goto decode_success;
   28189       }
   28190       /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
   28191       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28192           && !epartIsReg(getUChar(delta))) {
   28193          UChar  modrm = getUChar(delta);
   28194          UInt   rD    = gregOfRexRM(pfx, modrm);
   28195          IRTemp tD    = newTemp(Ity_V256);
   28196          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28197          delta += alen;
   28198          gen_SEGV_if_not_32_aligned(addr);
   28199          assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   28200          DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
   28201          putYMMReg(rD, mkexpr(tD));
   28202          goto decode_success;
   28203       }
   28204       break;
   28205 
   28206    case 0x2B:
   28207       /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
   28208       /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
   28209       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28210          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   28211                     uses_vvvv, vbi, pfx, delta, "vpackusdw",
   28212                     Iop_QNarrowBin32Sto16Ux8, NULL,
   28213                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   28214          goto decode_success;
   28215       }
   28216       /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
   28217       /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
   28218       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28219          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   28220                     uses_vvvv, vbi, pfx, delta, "vpackusdw",
   28221                     math_VPACKUSDW_YMM );
   28222          goto decode_success;
   28223       }
   28224       break;
   28225 
   28226    case 0x2C:
            /* Cases 0x2C..0x2F: VMASKMOVPS/VMASKMOVPD conditional loads
               (2C/2D) and stores (2E/2F).  All four require W0 and a
               memory operand (register forms are not decoded).
               dis_VMASKMOV is parameterised by vector width (isYMM),
               lane type (Ity_I32 for PS, Ity_I64 for PD) and direction
               (isLoad). */
   28227       /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2C /r */
   28228       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28229           && 0==getRexW(pfx)/*W0*/
   28230           && !epartIsReg(getUChar(delta))) {
   28231          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
   28232                                /*!isYMM*/False, Ity_I32, /*isLoad*/True );
   28233          goto decode_success;
   28234       }
   28235       /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2C /r */
   28236       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28237           && 0==getRexW(pfx)/*W0*/
   28238           && !epartIsReg(getUChar(delta))) {
   28239          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
   28240                                /*isYMM*/True, Ity_I32, /*isLoad*/True );
   28241          goto decode_success;
   28242       }
   28243       break;
   28244 
   28245    case 0x2D:
   28246       /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2D /r */
   28247       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28248           && 0==getRexW(pfx)/*W0*/
   28249           && !epartIsReg(getUChar(delta))) {
   28250          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28251                                /*!isYMM*/False, Ity_I64, /*isLoad*/True );
   28252          goto decode_success;
   28253       }
   28254       /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2D /r */
   28255       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28256           && 0==getRexW(pfx)/*W0*/
   28257           && !epartIsReg(getUChar(delta))) {
   28258          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28259                                /*isYMM*/True, Ity_I64, /*isLoad*/True );
   28260          goto decode_success;
   28261       }
   28262       break;
   28263 
   28264    case 0x2E:
            /* Store direction of VMASKMOVPS (isLoad==False). */
   28265       /* VMASKMOVPS xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2E /r */
   28266       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28267           && 0==getRexW(pfx)/*W0*/
   28268           && !epartIsReg(getUChar(delta))) {
   28269          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
   28270                                /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
   28271          goto decode_success;
   28272       }
   28273       /* VMASKMOVPS ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2E /r */
   28274       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28275           && 0==getRexW(pfx)/*W0*/
   28276           && !epartIsReg(getUChar(delta))) {
   28277          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
   28278                                /*isYMM*/True, Ity_I32, /*!isLoad*/False );
   28279          goto decode_success;
   28280       }
   28281       break;
   28282 
   28283    case 0x2F:
            /* Store direction of VMASKMOVPD (isLoad==False). */
   28284       /* VMASKMOVPD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2F /r */
   28285       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28286           && 0==getRexW(pfx)/*W0*/
   28287           && !epartIsReg(getUChar(delta))) {
   28288          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28289                                /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
   28290          goto decode_success;
   28291       }
   28292       /* VMASKMOVPD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2F /r */
   28293       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28294           && 0==getRexW(pfx)/*W0*/
   28295           && !epartIsReg(getUChar(delta))) {
   28296          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28297                                /*isYMM*/True, Ity_I64, /*!isLoad*/False );
   28298          goto decode_success;
   28299       }
   28300       break;
   28301 
   28302    case 0x30:
            /* Cases 0x30..0x35: the VPMOVZX* family, zero-extending packed
               narrow lanes to wider ones.  They reuse the same dis_PMOVxX*
               helpers as the sign-extending 0x20..0x25 family, with
               xIsZ==True selecting zero-extension; BQ and WQ have dedicated
               ZX helpers. */
   28303       /* VPMOVZXBW xmm2/m64, xmm1 */
   28304       /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
   28305       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28306          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   28307                                    True/*isAvx*/, True/*xIsZ*/ );
   28308          goto decode_success;
   28309       }
   28310       /* VPMOVZXBW xmm2/m128, ymm1 */
   28311       /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
   28312       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28313          delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
   28314          goto decode_success;
   28315       }
   28316       break;
   28317 
   28318    case 0x31:
   28319       /* VPMOVZXBD xmm2/m32, xmm1 */
   28320       /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
   28321       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28322          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   28323                                    True/*isAvx*/, True/*xIsZ*/ );
   28324          goto decode_success;
   28325       }
   28326       /* VPMOVZXBD xmm2/m64, ymm1 */
   28327       /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
   28328       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28329          delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
   28330          goto decode_success;
   28331       }
   28332       break;
   28333 
   28334    case 0x32:
   28335       /* VPMOVZXBQ xmm2/m16, xmm1 */
   28336       /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
   28337       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28338          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
   28339          goto decode_success;
   28340       }
   28341       /* VPMOVZXBQ xmm2/m32, ymm1 */
   28342       /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
   28343       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28344          delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
   28345          goto decode_success;
   28346       }
   28347       break;
   28348 
   28349    case 0x33:
   28350       /* VPMOVZXWD xmm2/m64, xmm1 */
   28351       /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
   28352       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28353          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   28354                                    True/*isAvx*/, True/*xIsZ*/ );
   28355          goto decode_success;
   28356       }
   28357       /* VPMOVZXWD xmm2/m128, ymm1 */
   28358       /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
   28359       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28360          delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
   28361          goto decode_success;
   28362       }
   28363       break;
   28364 
   28365    case 0x34:
   28366       /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
   28367       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28368          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
   28369          goto decode_success;
   28370       }
   28371       /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
   28372       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28373          delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
   28374          goto decode_success;
   28375       }
   28376       break;
   28377 
   28378    case 0x35:
   28379       /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
   28380       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28381          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   28382                                    True/*isAvx*/, True/*xIsZ*/ );
   28383          goto decode_success;
   28384       }
   28385       /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
   28386       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28387          delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
   28388          goto decode_success;
   28389       }
   28390       break;
   28391 
   28392    case 0x36:
            /* VPERMD: only a 256-bit W0 form exists, so there is no
               128-bit arm here. */
   28393       /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
   28394       if (have66noF2noF3(pfx)
   28395           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   28396          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   28397                     uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
   28398          goto decode_success;
   28399       }
   28400       break;
   28401 
   28402    case 0x37:
            /* VPCMPGTQ: signed 64-bit lane-wise greater-than, handled by
               the "simple" binop template. */
   28403       /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
   28404       /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
   28405       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28406          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28407                     uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
   28408          goto decode_success;
   28409       }
   28410       /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
   28411       /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
   28412       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28413          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28414                     uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
   28415          goto decode_success;
   28416       }
   28417       break;
   28418 
   28419    case 0x38:
            /* Cases 0x38..0x3F: the VPMIN*/
            /* VPMAX* family.  Every arm is the
               same shape: a lane-wise min/max binop routed through the
               "simple" NDS template, differing only in the IROp (element
               size, signedness, vector width). */
   28420       /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
   28421       /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
   28422       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28423          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28424                     uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
   28425          goto decode_success;
   28426       }
   28427       /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
   28428       /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
   28429       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28430          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28431                     uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
   28432          goto decode_success;
   28433       }
   28434       break;
   28435 
   28436    case 0x39:
   28437       /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
   28438       /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
   28439       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28440          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28441                     uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
   28442          goto decode_success;
   28443       }
   28444       /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
   28445       /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
   28446       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28447          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28448                     uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
   28449          goto decode_success;
   28450       }
   28451       break;
   28452 
   28453    case 0x3A:
   28454       /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
   28455       /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
   28456       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28457          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28458                     uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
   28459          goto decode_success;
   28460       }
   28461       /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
   28462       /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
   28463       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28464          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28465                     uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
   28466          goto decode_success;
   28467       }
   28468       break;
   28469 
   28470    case 0x3B:
   28471       /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
   28472       /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
   28473       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28474          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28475                     uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
   28476          goto decode_success;
   28477       }
   28478       /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
   28479       /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
   28480       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28481          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28482                     uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
   28483          goto decode_success;
   28484       }
   28485       break;
   28486 
   28487    case 0x3C:
   28488       /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
   28489       /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
   28490       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28491          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28492                     uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
   28493          goto decode_success;
   28494       }
   28495       /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
   28496       /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
   28497       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28498          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28499                     uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
   28500          goto decode_success;
   28501       }
   28502       break;
   28503 
   28504    case 0x3D:
   28505       /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
   28506       /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
   28507       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28508          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28509                     uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
   28510          goto decode_success;
   28511       }
   28512       /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
   28513       /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
   28514       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28515          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28516                     uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
   28517          goto decode_success;
   28518       }
   28519       break;
   28520 
   28521    case 0x3E:
   28522       /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
   28523       /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
   28524       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28525          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28526                     uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
   28527          goto decode_success;
   28528       }
   28529       /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
   28530       /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
   28531       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28532          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28533                     uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
   28534          goto decode_success;
   28535       }
   28536       break;
   28537 
   28538    case 0x3F:
   28539       /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
   28540       /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
   28541       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28542          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28543                     uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
   28544          goto decode_success;
   28545       }
   28546       /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
   28547       /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
   28548       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28549          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28550                     uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
   28551          goto decode_success;
   28552       }
   28553       break;
   28554 
   28555    case 0x40:
            /* VPMULLD: low 32 bits of 32x32 multiply, per lane. */
   28556       /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
   28557       /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
   28558       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28559          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   28560                     uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
   28561          goto decode_success;
   28562       }
   28563       /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
   28564       /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
   28565       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28566          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   28567                     uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
   28568          goto decode_success;
   28569       }
   28570       break;
   28571 
   28572    case 0x41:
            /* VPHMINPOSUW: 128-bit only; no 256-bit arm is decoded. */
   28573       /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
   28574       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28575          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
   28576          goto decode_success;
   28577       }
   28578       break;
   28579 
   28580    case 0x45:
            /* Cases 0x45..0x47: AVX2 variable-count vector shifts.  Unlike
               most arms in this switch, each `if` here handles both the
               128- and 256-bit encodings at once, by passing
               1==getVexL(pfx) to the helper as the width flag; W0 vs W1
               selects dword vs qword element size.  Note these arms set
               *uses_vvvv directly rather than via a template helper. */
   28581       /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
   28582       /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
   28583       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   28584          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
   28585                                          Iop_Shr32, 1==getVexL(pfx) );
   28586          *uses_vvvv = True;
   28587          goto decode_success;
   28588       }
   28589       /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
   28590       /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
   28591       if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   28592          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
   28593                                          Iop_Shr64, 1==getVexL(pfx) );
   28594          *uses_vvvv = True;
   28595          goto decode_success;
   28596       }
   28597       break;
   28598 
   28599    case 0x46:
            /* VPSRAVD: only the W0 (dword) arithmetic-shift form is
               decoded for this opcode. */
   28600       /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
   28601       /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
   28602       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   28603          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
   28604                                          Iop_Sar32, 1==getVexL(pfx) );
   28605          *uses_vvvv = True;
   28606          goto decode_success;
   28607       }
   28608       break;
   28609 
   28610    case 0x47:
   28611       /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
   28612       /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
   28613       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   28614          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
   28615                                          Iop_Shl32, 1==getVexL(pfx) );
   28616          *uses_vvvv = True;
   28617          goto decode_success;
   28618       }
   28619       /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
   28620       /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
   28621       if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   28622          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
   28623                                          Iop_Shl64, 1==getVexL(pfx) );
   28624          *uses_vvvv = True;
   28625          goto decode_success;
   28626       }
   28627       break;
   28628 
   28629    case 0x58:
            /* VPBROADCASTD: replicate the low dword of the source (xmm
               lane 0, or a 32-bit memory load) across all lanes.  The
               broadcast value is built by repeated high:low concatenation
               (32->64 via Iop_32HLto64, then 64s into a V128/V256). */
   28630       /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
   28631       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28632           && 0==getRexW(pfx)/*W0*/) {
   28633          UChar modrm = getUChar(delta);
   28634          UInt  rG    = gregOfRexRM(pfx, modrm);
   28635          IRTemp t32 = newTemp(Ity_I32);
   28636          if (epartIsReg(modrm)) {
   28637             UInt rE = eregOfRexRM(pfx, modrm);
   28638             delta++;
   28639             DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   28640             assign(t32, getXMMRegLane32(rE, 0));
   28641          } else {
   28642             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28643             delta += alen;
   28644             DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
   28645             assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   28646          }
   28647          IRTemp t64 = newTemp(Ity_I64);
   28648          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28649          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   28650          putYMMRegLoAndZU(rG, res);
   28651          goto decode_success;
   28652       }
   28653       /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
   28654       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28655           && 0==getRexW(pfx)/*W0*/) {
   28656          UChar modrm = getUChar(delta);
   28657          UInt  rG    = gregOfRexRM(pfx, modrm);
   28658          IRTemp t32 = newTemp(Ity_I32);
   28659          if (epartIsReg(modrm)) {
   28660             UInt rE = eregOfRexRM(pfx, modrm);
   28661             delta++;
   28662             DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   28663             assign(t32, getXMMRegLane32(rE, 0));
   28664          } else {
   28665             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28666             delta += alen;
   28667             DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
   28668             assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   28669          }
   28670          IRTemp t64 = newTemp(Ity_I64);
   28671          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28672          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   28673                                                   mkexpr(t64), mkexpr(t64));
   28674          putYMMReg(rG, res);
   28675          goto decode_success;
   28676       }
   28677       break;
   28678 
   28679    case 0x59:
            /* VPBROADCASTQ: same scheme as 0x58, but the replicated unit
               is a 64-bit value, so no 32->64 widening step is needed. */
   28680       /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
   28681       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28682           && 0==getRexW(pfx)/*W0*/) {
   28683          UChar modrm = getUChar(delta);
   28684          UInt  rG    = gregOfRexRM(pfx, modrm);
   28685          IRTemp t64 = newTemp(Ity_I64);
   28686          if (epartIsReg(modrm)) {
   28687             UInt rE = eregOfRexRM(pfx, modrm);
   28688             delta++;
   28689             DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   28690             assign(t64, getXMMRegLane64(rE, 0));
   28691          } else {
   28692             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28693             delta += alen;
   28694             DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
   28695             assign(t64, loadLE(Ity_I64, mkexpr(addr)));
   28696          }
   28697          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   28698          putYMMRegLoAndZU(rG, res);
   28699          goto decode_success;
   28700       }
   28701       /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
   28702       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28703           && 0==getRexW(pfx)/*W0*/) {
   28704          UChar modrm = getUChar(delta);
   28705          UInt  rG    = gregOfRexRM(pfx, modrm);
   28706          IRTemp t64 = newTemp(Ity_I64);
   28707          if (epartIsReg(modrm)) {
   28708             UInt rE = eregOfRexRM(pfx, modrm);
   28709             delta++;
   28710             DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   28711             assign(t64, getXMMRegLane64(rE, 0));
   28712          } else {
   28713             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28714             delta += alen;
   28715             DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
   28716             assign(t64, loadLE(Ity_I64, mkexpr(addr)));
   28717          }
   28718          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   28719                                                   mkexpr(t64), mkexpr(t64));
   28720          putYMMReg(rG, res);
   28721          goto decode_success;
   28722       }
   28723       break;
   28724 
   28725    case 0x5A:
            /* VBROADCASTI128: load 128 bits from memory and replicate into
               both halves of the destination ymm.  Memory operand only
               (!epartIsReg); 256-bit form only. */
   28726       /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
   28727       if (have66noF2noF3(pfx)
   28728           && 1==getVexL(pfx)/*256*/
   28729           && !epartIsReg(getUChar(delta))) {
   28730          UChar modrm = getUChar(delta);
   28731          UInt  rG    = gregOfRexRM(pfx, modrm);
   28732          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28733          delta += alen;
   28734          DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
   28735          IRTemp t128 = newTemp(Ity_V128);
   28736          assign(t128, loadLE(Ity_V128, mkexpr(addr)));
   28737          putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
   28738          goto decode_success;
   28739       }
   28740       break;
   28741 
   28742    case 0x78:
            /* VPBROADCASTB: replicate the source byte across every lane.
               The byte is widened by successive doubling
               (8->16->32->64 via Iop_8HLto16 / Iop_16HLto32 /
               Iop_32HLto64) and the 64-bit pattern then fills the
               V128/V256 result. */
   28743       /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
   28744       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28745           && 0==getRexW(pfx)/*W0*/) {
   28746          UChar modrm = getUChar(delta);
   28747          UInt  rG    = gregOfRexRM(pfx, modrm);
   28748          IRTemp t8   = newTemp(Ity_I8);
   28749          if (epartIsReg(modrm)) {
   28750             UInt rE = eregOfRexRM(pfx, modrm);
   28751             delta++;
   28752             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            /* Register form: take the bottom byte of the source xmm's
               lane 0. */
   28753             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   28754          } else {
   28755             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28756             delta += alen;
   28757             DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
   28758             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   28759          }
   28760          IRTemp t16 = newTemp(Ity_I16);
   28761          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   28762          IRTemp t32 = newTemp(Ity_I32);
   28763          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28764          IRTemp t64 = newTemp(Ity_I64);
   28765          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28766          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   28767          putYMMRegLoAndZU(rG, res);
   28768          goto decode_success;
   28769       }
   28770       /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
   28771       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28772           && 0==getRexW(pfx)/*W0*/) {
   28773          UChar modrm = getUChar(delta);
   28774          UInt  rG    = gregOfRexRM(pfx, modrm);
   28775          IRTemp t8   = newTemp(Ity_I8);
   28776          if (epartIsReg(modrm)) {
   28777             UInt rE = eregOfRexRM(pfx, modrm);
   28778             delta++;
   28779             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   28780             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   28781          } else {
   28782             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28783             delta += alen;
   28784             DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
   28785             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   28786          }
   28787          IRTemp t16 = newTemp(Ity_I16);
   28788          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   28789          IRTemp t32 = newTemp(Ity_I32);
   28790          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28791          IRTemp t64 = newTemp(Ity_I64);
   28792          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28793          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   28794                                                   mkexpr(t64), mkexpr(t64));
   28795          putYMMReg(rG, res);
   28796          goto decode_success;
   28797       }
   28798       break;
   28799 
    28800    case 0x79:
             /* VPBROADCASTW: broadcast one 16-bit word (low word of an xmm
                register, or a 16-bit memory operand) to every word lane of
                the destination. */
    28801       /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
    28802       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28803           && 0==getRexW(pfx)/*W0*/) {
    28804          UChar modrm = getUChar(delta);
    28805          UInt  rG    = gregOfRexRM(pfx, modrm);
    28806          IRTemp t16  = newTemp(Ity_I16);
    28807          if (epartIsReg(modrm)) {
    28808             UInt rE = eregOfRexRM(pfx, modrm);
    28809             delta++;
    28810             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
    28811             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
    28812          } else {
    28813             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
    28814             delta += alen;
    28815             DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
    28816             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
    28817          }
                   /* Double up 16 -> 32 -> 64 bits, then pair the 64-bit value
                      to fill the 128-bit result; upper ymm half is zeroed. */
    28818          IRTemp t32 = newTemp(Ity_I32);
    28819          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
    28820          IRTemp t64 = newTemp(Ity_I64);
    28821          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
    28822          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
    28823          putYMMRegLoAndZU(rG, res);
    28824          goto decode_success;
    28825       }
    28826       /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
    28827       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28828           && 0==getRexW(pfx)/*W0*/) {
    28829          UChar modrm = getUChar(delta);
    28830          UInt  rG    = gregOfRexRM(pfx, modrm);
    28831          IRTemp t16  = newTemp(Ity_I16);
    28832          if (epartIsReg(modrm)) {
    28833             UInt rE = eregOfRexRM(pfx, modrm);
    28834             delta++;
    28835             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
    28836             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
    28837          } else {
    28838             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
    28839             delta += alen;
    28840             DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
    28841             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
    28842          }
                   /* As above, but build all 256 bits from four copies of the
                      replicated 64-bit value. */
    28843          IRTemp t32 = newTemp(Ity_I32);
    28844          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
    28845          IRTemp t64 = newTemp(Ity_I64);
    28846          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
    28847          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
    28848                                                   mkexpr(t64), mkexpr(t64));
    28849          putYMMReg(rG, res);
    28850          goto decode_success;
    28851       }
    28852       break;
   28853 
    28854    case 0x8C:
             /* Masked vector loads (0x8C) and stores (0x8E below).  Only the
                memory forms are valid, hence the !epartIsReg guard on every
                arm; the per-lane masking itself is implemented in
                dis_VMASKMOV (defined elsewhere in this file). */
    28855       /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
    28856       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28857           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28858          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
    28859                                /*!isYMM*/False, Ity_I32, /*isLoad*/True );
    28860          goto decode_success;
    28861       }
    28862       /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
    28863       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28864           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28865          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
    28866                                /*isYMM*/True, Ity_I32, /*isLoad*/True );
    28867          goto decode_success;
    28868       }
    28869       /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
    28870       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28871           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28872          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
    28873                                /*!isYMM*/False, Ity_I64, /*isLoad*/True );
    28874          goto decode_success;
    28875       }
    28876       /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
    28877       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28878           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28879          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
    28880                                /*isYMM*/True, Ity_I64, /*isLoad*/True );
    28881          goto decode_success;
    28882       }
    28883       break;
    28884 
    28885    case 0x8E:
             /* Store direction of VPMASKMOVD/Q: identical dispatch to 0x8C
                except isLoad==False. */
    28886       /* VPMASKMOVD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
    28887       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28888           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28889          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
    28890                                /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
    28891          goto decode_success;
    28892       }
    28893       /* VPMASKMOVD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
    28894       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28895           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28896          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
    28897                                /*isYMM*/True, Ity_I32, /*!isLoad*/False );
    28898          goto decode_success;
    28899       }
    28900       /* VPMASKMOVQ xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
    28901       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28902           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28903          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
    28904                                /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
    28905          goto decode_success;
    28906       }
    28907       /* VPMASKMOVQ ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
    28908       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28909           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28910          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
    28911                                /*isYMM*/True, Ity_I64, /*!isLoad*/False );
    28912          goto decode_success;
    28913       }
    28914       break;
   28915 
    28916    case 0x90:
             /* Gather family, opcodes 0x90..0x93.  Each arm dispatches to
                dis_VGATHER with three selectors: element width (Ity_I32 vs
                Ity_I64, from VEX.W), register width (isYMM, from VEX.L) and
                index width (isVM64x: 64-bit vs 32-bit indices).  Unlike the
                other helpers here, dis_VGATHER can reject an encoding; it
                signals that by returning delta unchanged, so success is only
                claimed when delta advanced. */
    28917       /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
    28918       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28919           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28920          Long delta0 = delta;
    28921          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
    28922                               /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
    28923          if (delta != delta0)
    28924             goto decode_success;
    28925       }
    28926       /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
    28927       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28928           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28929          Long delta0 = delta;
    28930          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
    28931                               /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
    28932          if (delta != delta0)
    28933             goto decode_success;
    28934       }
    28935       /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
    28936       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28937           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28938          Long delta0 = delta;
    28939          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
    28940                               /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
    28941          if (delta != delta0)
    28942             goto decode_success;
    28943       }
    28944       /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
    28945       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28946           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28947          Long delta0 = delta;
    28948          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
    28949                               /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
    28950          if (delta != delta0)
    28951             goto decode_success;
    28952       }
    28953       break;
    28954 
    28955    case 0x91:
             /* VPGATHERQD/VPGATHERQQ: as 0x90 but with 64-bit indices. */
    28956       /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
    28957       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28958           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28959          Long delta0 = delta;
    28960          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
    28961                               /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
    28962          if (delta != delta0)
    28963             goto decode_success;
    28964       }
    28965       /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
    28966       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28967           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28968          Long delta0 = delta;
    28969          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
    28970                               /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
    28971          if (delta != delta0)
    28972             goto decode_success;
    28973       }
    28974       /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
    28975       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28976           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28977          Long delta0 = delta;
    28978          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
    28979                               /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
    28980          if (delta != delta0)
    28981             goto decode_success;
    28982       }
    28983       /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
    28984       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    28985           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    28986          Long delta0 = delta;
    28987          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
    28988                               /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
    28989          if (delta != delta0)
    28990             goto decode_success;
    28991       }
    28992       break;
    28993 
    28994    case 0x92:
             /* VGATHERDPS/VGATHERDPD: floating-point gathers with 32-bit
                indices; same element/width dispatch as 0x90. */
    28995       /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
    28996       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    28997           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    28998          Long delta0 = delta;
    28999          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
    29000                               /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
    29001          if (delta != delta0)
    29002             goto decode_success;
    29003       }
    29004       /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
    29005       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    29006           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    29007          Long delta0 = delta;
    29008          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
    29009                               /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
    29010          if (delta != delta0)
    29011             goto decode_success;
    29012       }
    29013       /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
    29014       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    29015           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    29016          Long delta0 = delta;
    29017          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
    29018                               /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
    29019          if (delta != delta0)
    29020             goto decode_success;
    29021       }
    29022       /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
    29023       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    29024           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    29025          Long delta0 = delta;
    29026          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
    29027                               /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
    29028          if (delta != delta0)
    29029             goto decode_success;
    29030       }
    29031       break;
    29032 
    29033    case 0x93:
             /* VGATHERQPS/VGATHERQPD: floating-point gathers with 64-bit
                indices. */
    29034       /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
    29035       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    29036           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    29037          Long delta0 = delta;
    29038          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
    29039                               /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
    29040          if (delta != delta0)
    29041             goto decode_success;
    29042       }
    29043       /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
    29044       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    29045           && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
    29046          Long delta0 = delta;
    29047          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
    29048                               /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
    29049          if (delta != delta0)
    29050             goto decode_success;
    29051       }
    29052       /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
    29053       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
    29054           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    29055          Long delta0 = delta;
    29056          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
    29057                               /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
    29058          if (delta != delta0)
    29059             goto decode_success;
    29060       }
    29061       /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
    29062       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
    29063           && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
    29064          Long delta0 = delta;
    29065          delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
    29066                               /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
    29067          if (delta != delta0)
    29068             goto decode_success;
    29069       }
    29070       break;
   29071 
    29072    case 0x96 ... 0x9F:
    29073    case 0xA6 ... 0xAF:
    29074    case 0xB6 ... 0xBF:
             /* The entire FMA 132/213/231 family (listed exhaustively below)
                funnels into one handler; dis_FMA works out the exact variant
                from opc and the prefix state.  All forms take vvvv, hence the
                unconditional *uses_vvvv = True on success. */
    29075       /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
    29076       /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
    29077       /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
    29078       /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
    29079       /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
    29080       /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
    29081       /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
    29082       /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
    29083       /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
    29084       /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
    29085       /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
    29086       /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
    29087       /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
    29088       /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
    29089       /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
    29090       /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
    29091       /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
    29092       /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
    29093       /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
    29094       /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
    29095       /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
    29096       /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
    29097       /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
    29098       /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
    29099       /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
    29100       /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
    29101       /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
    29102       /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
    29103       /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
    29104       /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
    29105       /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
    29106       /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
    29107       /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
    29108       /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
    29109       /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
    29110       /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
    29111       /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
    29112       /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
    29113       /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
    29114       /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
    29115       /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
    29116       /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
    29117       /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
    29118       /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
    29119       /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
    29120       /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
    29121       /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
    29122       /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
    29123       /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
    29124       /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
    29125       /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
    29126       /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
    29127       /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
    29128       /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
    29129       /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
    29130       /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
    29131       /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
    29132       /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
    29133       /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
    29134       /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
    29135       /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
    29136       /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
    29137       /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
    29138       /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
    29139       /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
    29140       /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
    29141       /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
    29142       /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
    29143       /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
    29144       /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
    29145       /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
    29146       /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
    29147       /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
    29148       /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
    29149       /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
    29150       /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
    29151       /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
    29152       /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
    29153       /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
    29154       /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
    29155       /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
    29156       /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
    29157       /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
    29158       /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
    29159       /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
    29160       /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
    29161       /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
    29162       /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
    29163       /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
    29164       /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
    29165       /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
    29166       /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
    29167       /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
    29168       /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
    29169       /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
    29170       /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
    29171       if (have66noF2noF3(pfx)) {
    29172          delta = dis_FMA( vbi, pfx, delta, opc );
    29173          *uses_vvvv = True;
    29174          goto decode_success;
    29175       }
    29176       break;
   29177 
    29178    case 0xDB:
    29179    case 0xDC:
    29180    case 0xDD:
    29181    case 0xDE:
    29182    case 0xDF:
             /* VEX-encoded AES-NI: one handler for all five opcodes; only
                128-bit (VEX.L=0) forms exist. */
    29183       /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
    29184       /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
    29185       /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
    29186       /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
    29187       /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
    29188       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
                   /* NOTE(review): comment previously read /*!isAvx*/ which
                      contradicted the True actually passed; this is the VEX
                      decode path, so isAvx is indeed True here (the other
                      call sites in this file consistently match comment and
                      value). */
    29189          delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
                   /* VAESIMC (DB) is two-operand and has no vvvv source;
                      the four enc/dec forms do. */
    29190          if (opc != 0xDB) *uses_vvvv = True;
    29191          goto decode_success;
    29192       }
    29193       break;
   29194 
    29195    case 0xF2:
             /* BMI1 ANDN: G = ~V & E, i.e. AND of the inverted vvvv register
                with r/m.  Flags are computed lazily via the thunk, using a
                dedicated ANDN CC_OP with DEP1 = result and DEP2 = 0. */
    29196       /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
    29197       /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
    29198       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
                   /* Operand size comes from VEX.W: 8 bytes for W1, else 4. */
    29199          Int     size = getRexW(pfx) ? 8 : 4;
    29200          IRType  ty   = szToITy(size);
    29201          IRTemp  dst  = newTemp(ty);
    29202          IRTemp  src1 = newTemp(ty);
    29203          IRTemp  src2 = newTemp(ty);
    29204          UChar   rm   = getUChar(delta);
    29205 
    29206          assign( src1, getIRegV(size,pfx) );
    29207          if (epartIsReg(rm)) {
    29208             assign( src2, getIRegE(size,pfx,rm) );
    29209             DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
    29210                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
    29211             delta++;
    29212          } else {
    29213             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
    29214             assign( src2, loadLE(ty, mkexpr(addr)) );
    29215             DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
    29216                 nameIRegG(size,pfx,rm));
    29217             delta += alen;
    29218          }
    29219 
                   /* dst = (~src1) & src2 */
    29220          assign( dst, binop( mkSizedOp(ty,Iop_And8),
    29221                              unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
    29222                              mkexpr(src2) ) );
    29223          putIRegG( size, pfx, rm, mkexpr(dst) );
    29224          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
    29225                                                ? AMD64G_CC_OP_ANDN64
    29226                                                : AMD64G_CC_OP_ANDN32)) );
    29227          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
    29228          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
    29229          *uses_vvvv = True;
    29230          goto decode_success;
    29231       }
    29232       break;
   29233 
    29234    case 0xF3:
             /* BMI1 unary-source group.  Opcode F3 is shared; the reg field
                of the ModRM byte (/1, /2, /3) selects BLSR, BLSMSK or BLSI.
                The destination is the vvvv register (NDD encoding); each
                form sets its own dedicated CC_OP with DEP1 = result and
                DEP2 = source. */
    29235       /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
    29236       /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
    29237       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
    29238           && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
    29239          Int     size = getRexW(pfx) ? 8 : 4;
    29240          IRType  ty   = szToITy(size);
    29241          IRTemp  src  = newTemp(ty);
    29242          IRTemp  dst  = newTemp(ty);
    29243          UChar   rm   = getUChar(delta);
    29244 
    29245          if (epartIsReg(rm)) {
    29246             assign( src, getIRegE(size,pfx,rm) );
    29247             DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
    29248                 nameIRegV(size,pfx));
    29249             delta++;
    29250          } else {
    29251             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
    29252             assign( src, loadLE(ty, mkexpr(addr)) );
    29253             DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
    29254             delta += alen;
    29255          }
    29256 
                   /* dst = (0 - src) & src: isolates the lowest set bit of
                      src (two's-complement identity src & -src). */
    29257          assign( dst, binop(mkSizedOp(ty,Iop_And8),
    29258                             binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
    29259                                   mkexpr(src)), mkexpr(src)) );
    29260          putIRegV( size, pfx, mkexpr(dst) );
    29261          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
    29262                                                ? AMD64G_CC_OP_BLSI64
    29263                                                : AMD64G_CC_OP_BLSI32)) );
    29264          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
    29265          stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
    29266          *uses_vvvv = True;
    29267          goto decode_success;
    29268       }
    29269       /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
    29270       /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
    29271       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
    29272           && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
    29273          Int     size = getRexW(pfx) ? 8 : 4;
    29274          IRType  ty   = szToITy(size);
    29275          IRTemp  src  = newTemp(ty);
    29276          IRTemp  dst  = newTemp(ty);
    29277          UChar   rm   = getUChar(delta);
    29278 
    29279          if (epartIsReg(rm)) {
    29280             assign( src, getIRegE(size,pfx,rm) );
    29281             DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
    29282                 nameIRegV(size,pfx));
    29283             delta++;
    29284          } else {
    29285             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
    29286             assign( src, loadLE(ty, mkexpr(addr)) );
    29287             DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
    29288             delta += alen;
    29289          }
    29290 
                   /* dst = (src - 1) ^ src: mask of all bits up to and
                      including the lowest set bit of src. */
    29291          assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
    29292                             binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
    29293                                   mkU(ty, 1)), mkexpr(src)) );
    29294          putIRegV( size, pfx, mkexpr(dst) );
    29295          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
    29296                                                ? AMD64G_CC_OP_BLSMSK64
    29297                                                : AMD64G_CC_OP_BLSMSK32)) );
    29298          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
    29299          stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
    29300          *uses_vvvv = True;
    29301          goto decode_success;
    29302       }
    29303       /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
    29304       /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
    29305       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
    29306           && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
    29307          Int     size = getRexW(pfx) ? 8 : 4;
    29308          IRType  ty   = szToITy(size);
    29309          IRTemp  src  = newTemp(ty);
    29310          IRTemp  dst  = newTemp(ty);
    29311          UChar   rm   = getUChar(delta);
    29312 
    29313          if (epartIsReg(rm)) {
    29314             assign( src, getIRegE(size,pfx,rm) );
    29315             DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
    29316                 nameIRegV(size,pfx));
    29317             delta++;
    29318          } else {
    29319             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
    29320             assign( src, loadLE(ty, mkexpr(addr)) );
    29321             DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
    29322             delta += alen;
    29323          }
    29324 
                   /* dst = (src - 1) & src: clears the lowest set bit of
                      src. */
    29325          assign( dst, binop(mkSizedOp(ty,Iop_And8),
    29326                             binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
    29327                                   mkU(ty, 1)), mkexpr(src)) );
    29328          putIRegV( size, pfx, mkexpr(dst) );
    29329          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
    29330                                                ? AMD64G_CC_OP_BLSR64
    29331                                                : AMD64G_CC_OP_BLSR32)) );
    29332          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
    29333          stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
    29334          *uses_vvvv = True;
    29335          goto decode_success;
    29336       }
    29337       break;
   29338 
   29339    case 0xF5:
   29340       /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
   29341       /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
   29342       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29343          Int     size  = getRexW(pfx) ? 8 : 4;
   29344          IRType  ty    = szToITy(size);
   29345          IRTemp  dst   = newTemp(ty);
   29346          IRTemp  src1  = newTemp(ty);
   29347          IRTemp  src2  = newTemp(ty);
   29348          IRTemp  start = newTemp(Ity_I8);
   29349          IRTemp  cond  = newTemp(Ity_I1);
   29350          UChar   rm    = getUChar(delta);
   29351 
   29352          assign( src2, getIRegV(size,pfx) );
   29353          if (epartIsReg(rm)) {
   29354             assign( src1, getIRegE(size,pfx,rm) );
   29355             DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
   29356                 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   29357             delta++;
   29358          } else {
   29359             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29360             assign( src1, loadLE(ty, mkexpr(addr)) );
   29361             DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
   29362                 nameIRegG(size,pfx,rm));
   29363             delta += alen;
   29364          }
   29365 
   29366          assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
   29367          assign( cond, binop(Iop_CmpLT32U,
   29368                              unop(Iop_8Uto32, mkexpr(start)),
   29369                              mkU32(8*size)) );
   29370          /* if (start < opsize) {
   29371                if (start == 0)
   29372                   dst = 0;
   29373                else
   29374                   dst = (src1 << (opsize-start)) u>> (opsize-start);
   29375             } else {
   29376                dst = src1;
   29377             } */
   29378          assign( dst,
   29379                  IRExpr_ITE(
   29380                     mkexpr(cond),
   29381                     IRExpr_ITE(
   29382                        binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
   29383                        mkU(ty, 0),
   29384                        binop(
   29385                           mkSizedOp(ty,Iop_Shr8),
   29386                           binop(
   29387                              mkSizedOp(ty,Iop_Shl8),
   29388                              mkexpr(src1),
   29389                              binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
   29390                           ),
   29391                           binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
   29392                        )
   29393                     ),
   29394                     mkexpr(src1)
   29395                  )
   29396                );
   29397          putIRegG( size, pfx, rm, mkexpr(dst) );
   29398          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
   29399                                                ? AMD64G_CC_OP_BLSR64
   29400                                                : AMD64G_CC_OP_BLSR32)) );
   29401          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
   29402          stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
   29403          *uses_vvvv = True;
   29404          goto decode_success;
   29405       }
   29406       /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
   29407       /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
   29408       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29409          Int     size = getRexW(pfx) ? 8 : 4;
   29410          IRType  ty   = szToITy(size);
   29411          IRTemp  src  = newTemp(ty);
   29412          IRTemp  mask = newTemp(ty);
   29413          UChar   rm   = getUChar(delta);
   29414 
   29415          assign( src, getIRegV(size,pfx) );
   29416          if (epartIsReg(rm)) {
   29417             assign( mask, getIRegE(size,pfx,rm) );
   29418             DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
   29419                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
   29420             delta++;
   29421          } else {
   29422             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29423             assign( mask, loadLE(ty, mkexpr(addr)) );
   29424             DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
   29425                 nameIRegG(size,pfx,rm));
   29426             delta += alen;
   29427          }
   29428 
   29429          IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
   29430                                         widenUto64(mkexpr(mask)) );
   29431          putIRegG( size, pfx, rm,
   29432                    narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   29433                                               "amd64g_calculate_pdep",
   29434                                               &amd64g_calculate_pdep, args)) );
   29435          *uses_vvvv = True;
   29436          /* Flags aren't modified.  */
   29437          goto decode_success;
   29438       }
   29439       /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
   29440       /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
   29441       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29442          Int     size = getRexW(pfx) ? 8 : 4;
   29443          IRType  ty   = szToITy(size);
   29444          IRTemp  src  = newTemp(ty);
   29445          IRTemp  mask = newTemp(ty);
   29446          UChar   rm   = getUChar(delta);
   29447 
   29448          assign( src, getIRegV(size,pfx) );
   29449          if (epartIsReg(rm)) {
   29450             assign( mask, getIRegE(size,pfx,rm) );
   29451             DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
   29452                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
   29453             delta++;
   29454          } else {
   29455             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29456             assign( mask, loadLE(ty, mkexpr(addr)) );
   29457             DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
   29458                 nameIRegG(size,pfx,rm));
   29459             delta += alen;
   29460          }
   29461 
   29462          /* First mask off bits not set in mask, they are ignored
   29463             and it should be fine if they contain undefined values.  */
   29464          IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
   29465                                 mkexpr(src), mkexpr(mask));
   29466          IRExpr** args = mkIRExprVec_2( widenUto64(masked),
   29467                                         widenUto64(mkexpr(mask)) );
   29468          putIRegG( size, pfx, rm,
   29469                    narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   29470                                               "amd64g_calculate_pext",
   29471                                               &amd64g_calculate_pext, args)) );
   29472          *uses_vvvv = True;
   29473          /* Flags aren't modified.  */
   29474          goto decode_success;
   29475       }
   29476       break;
   29477 
   29478    case 0xF6:
   29479       /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
   29480       /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
   29481       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29482          Int     size = getRexW(pfx) ? 8 : 4;
   29483          IRType  ty   = szToITy(size);
   29484          IRTemp  src1 = newTemp(ty);
   29485          IRTemp  src2 = newTemp(ty);
   29486          IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
   29487          UChar   rm   = getUChar(delta);
   29488 
   29489          assign( src1, getIRegRDX(size) );
   29490          if (epartIsReg(rm)) {
   29491             assign( src2, getIRegE(size,pfx,rm) );
   29492             DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
   29493                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
   29494             delta++;
   29495          } else {
   29496             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29497             assign( src2, loadLE(ty, mkexpr(addr)) );
   29498             DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
   29499                 nameIRegG(size,pfx,rm));
   29500             delta += alen;
   29501          }
   29502 
   29503          assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
   29504                             mkexpr(src1), mkexpr(src2)) );
   29505          putIRegV( size, pfx,
   29506                    unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
   29507          putIRegG( size, pfx, rm,
   29508                    unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
   29509                         mkexpr(res)) );
   29510          *uses_vvvv = True;
   29511          /* Flags aren't modified.  */
   29512          goto decode_success;
   29513       }
   29514       break;
   29515 
   29516    case 0xF7:
   29517       /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
   29518       /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
   29519       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29520          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
   29521          goto decode_success;
   29522       }
   29523       /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
   29524       /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
   29525       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29526          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
   29527          goto decode_success;
   29528       }
   29529       /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
   29530       /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
   29531       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29532          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
   29533          goto decode_success;
   29534       }
   29535       /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
   29536       /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
   29537       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29538          Int     size  = getRexW(pfx) ? 8 : 4;
   29539          IRType  ty    = szToITy(size);
   29540          IRTemp  dst   = newTemp(ty);
   29541          IRTemp  src1  = newTemp(ty);
   29542          IRTemp  src2  = newTemp(ty);
   29543          IRTemp  stle  = newTemp(Ity_I16);
   29544          IRTemp  start = newTemp(Ity_I8);
   29545          IRTemp  len   = newTemp(Ity_I8);
   29546          UChar   rm    = getUChar(delta);
   29547 
   29548          assign( src2, getIRegV(size,pfx) );
   29549          if (epartIsReg(rm)) {
   29550             assign( src1, getIRegE(size,pfx,rm) );
   29551             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
   29552                 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   29553             delta++;
   29554          } else {
   29555             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29556             assign( src1, loadLE(ty, mkexpr(addr)) );
   29557             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
   29558                 nameIRegG(size,pfx,rm));
   29559             delta += alen;
   29560          }
   29561 
   29562          assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
   29563          assign( start, unop( Iop_16to8, mkexpr(stle) ) );
   29564          assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
   29565          /* if (start+len < opsize) {
   29566                if (len != 0)
   29567                   dst = (src1 << (opsize-start-len)) u>> (opsize-len);
   29568                else
   29569                   dst = 0;
   29570             } else {
   29571                if (start < opsize)
   29572                   dst = src1 u>> start;
   29573                else
   29574                   dst = 0;
   29575             } */
   29576          assign( dst,
   29577                  IRExpr_ITE(
   29578                     binop(Iop_CmpLT32U,
   29579                           binop(Iop_Add32,
   29580                                 unop(Iop_8Uto32, mkexpr(start)),
   29581                                 unop(Iop_8Uto32, mkexpr(len))),
   29582                           mkU32(8*size)),
   29583                     IRExpr_ITE(
   29584                        binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
   29585                        mkU(ty, 0),
   29586                        binop(mkSizedOp(ty,Iop_Shr8),
   29587                              binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
   29588                                    binop(Iop_Sub8,
   29589                                          binop(Iop_Sub8, mkU8(8*size),
   29590                                                mkexpr(start)),
   29591                                          mkexpr(len))),
   29592                              binop(Iop_Sub8, mkU8(8*size),
   29593                                    mkexpr(len)))
   29594                     ),
   29595                     IRExpr_ITE(
   29596                        binop(Iop_CmpLT32U,
   29597                              unop(Iop_8Uto32, mkexpr(start)),
   29598                              mkU32(8*size)),
   29599                        binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
   29600                              mkexpr(start)),
   29601                        mkU(ty, 0)
   29602                     )
   29603                  )
   29604                );
   29605          putIRegG( size, pfx, rm, mkexpr(dst) );
   29606          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
   29607                                                ? AMD64G_CC_OP_ANDN64
   29608                                                : AMD64G_CC_OP_ANDN32)) );
   29609          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
   29610          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   29611          *uses_vvvv = True;
   29612          goto decode_success;
   29613       }
   29614       break;
   29615 
   29616    default:
   29617       break;
   29618 
   29619    }
   29620 
   29621   //decode_failure:
   29622    return deltaIN;
   29623 
   29624   decode_success:
   29625    return delta;
   29626 }
   29627 
   29628 
   29629 /*------------------------------------------------------------*/
   29630 /*---                                                      ---*/
   29631 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
   29632 /*---                                                      ---*/
   29633 /*------------------------------------------------------------*/
   29634 
   29635 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
   29636 {
   29637    vassert(imm8 < 256);
   29638    IRTemp s3, s2, s1, s0;
   29639    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   29640    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   29641 #  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
   29642                                     : ((_nn)==2) ? s2 : s3)
   29643    IRTemp res = newTemp(Ity_V128);
   29644    assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
   29645                               SEL((imm8 >> 4) & 3),
   29646                               SEL((imm8 >> 2) & 3),
   29647                               SEL((imm8 >> 0) & 3) ));
   29648 #  undef SEL
   29649    return res;
   29650 }
   29651 
   29652 __attribute__((noinline))
   29653 static
   29654 Long dis_ESC_0F3A__VEX (
   29655         /*MB_OUT*/DisResult* dres,
   29656         /*OUT*/   Bool*      uses_vvvv,
   29657         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   29658         Bool         resteerCisOk,
   29659         void*        callback_opaque,
   29660         const VexArchInfo* archinfo,
   29661         const VexAbiInfo*  vbi,
   29662         Prefix pfx, Int sz, Long deltaIN
   29663      )
   29664 {
   29665    IRTemp addr  = IRTemp_INVALID;
   29666    Int    alen  = 0;
   29667    HChar  dis_buf[50];
   29668    Long   delta = deltaIN;
   29669    UChar  opc   = getUChar(delta);
   29670    delta++;
   29671    *uses_vvvv = False;
   29672 
   29673    switch (opc) {
   29674 
   29675    case 0x00:
   29676    case 0x01:
   29677       /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
   29678       /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
   29679       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29680           && 1==getRexW(pfx)/*W1*/) {
   29681          UChar  modrm = getUChar(delta);
   29682          UInt   imm8  = 0;
   29683          UInt   rG    = gregOfRexRM(pfx, modrm);
   29684          IRTemp sV    = newTemp(Ity_V256);
   29685          const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
   29686          if (epartIsReg(modrm)) {
   29687             UInt rE = eregOfRexRM(pfx, modrm);
   29688             delta += 1;
   29689             imm8 = getUChar(delta);
   29690             DIP("%s $%u,%s,%s\n",
   29691                 name, imm8, nameYMMReg(rE), nameYMMReg(rG));
   29692             assign(sV, getYMMReg(rE));
   29693          } else {
   29694             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29695             delta += alen;
   29696             imm8 = getUChar(delta);
   29697             DIP("%s $%u,%s,%s\n",
   29698                 name, imm8, dis_buf, nameYMMReg(rG));
   29699             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29700          }
   29701          delta++;
   29702          IRTemp s[4];
   29703          s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   29704          breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
   29705          IRTemp dV = newTemp(Ity_V256);
   29706          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   29707                                mkexpr(s[(imm8 >> 6) & 3]),
   29708                                mkexpr(s[(imm8 >> 4) & 3]),
   29709                                mkexpr(s[(imm8 >> 2) & 3]),
   29710                                mkexpr(s[(imm8 >> 0) & 3])));
   29711          putYMMReg(rG, mkexpr(dV));
   29712          goto decode_success;
   29713       }
   29714       break;
   29715 
   29716    case 0x02:
   29717       /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
   29718       if (have66noF2noF3(pfx)
   29719           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   29720          UChar  modrm = getUChar(delta);
   29721          UInt   imm8  = 0;
   29722          UInt   rG    = gregOfRexRM(pfx, modrm);
   29723          UInt   rV    = getVexNvvvv(pfx);
   29724          IRTemp sV    = newTemp(Ity_V128);
   29725          IRTemp dV    = newTemp(Ity_V128);
   29726          UInt   i;
   29727          IRTemp s[4], d[4];
   29728          assign(sV, getXMMReg(rV));
   29729          if (epartIsReg(modrm)) {
   29730             UInt rE = eregOfRexRM(pfx, modrm);
   29731             delta += 1;
   29732             imm8 = getUChar(delta);
   29733             DIP("vpblendd $%u,%s,%s,%s\n",
   29734                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   29735             assign(dV, getXMMReg(rE));
   29736          } else {
   29737             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29738             delta += alen;
   29739             imm8 = getUChar(delta);
   29740             DIP("vpblendd $%u,%s,%s,%s\n",
   29741                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   29742             assign(dV, loadLE(Ity_V128, mkexpr(addr)));
   29743          }
   29744          delta++;
   29745          for (i = 0; i < 4; i++) {
   29746             s[i] = IRTemp_INVALID;
   29747             d[i] = IRTemp_INVALID;
   29748          }
   29749          breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
   29750          breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
   29751          for (i = 0; i < 4; i++)
   29752             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   29753          putYMMRegLane128(rG, 1, mkV128(0));
   29754          *uses_vvvv = True;
   29755          goto decode_success;
   29756       }
   29757       /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
   29758       if (have66noF2noF3(pfx)
   29759           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   29760          UChar  modrm = getUChar(delta);
   29761          UInt   imm8  = 0;
   29762          UInt   rG    = gregOfRexRM(pfx, modrm);
   29763          UInt   rV    = getVexNvvvv(pfx);
   29764          IRTemp sV    = newTemp(Ity_V256);
   29765          IRTemp dV    = newTemp(Ity_V256);
   29766          UInt   i;
   29767          IRTemp s[8], d[8];
   29768          assign(sV, getYMMReg(rV));
   29769          if (epartIsReg(modrm)) {
   29770             UInt rE = eregOfRexRM(pfx, modrm);
   29771             delta += 1;
   29772             imm8 = getUChar(delta);
   29773             DIP("vpblendd $%u,%s,%s,%s\n",
   29774                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   29775             assign(dV, getYMMReg(rE));
   29776          } else {
   29777             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29778             delta += alen;
   29779             imm8 = getUChar(delta);
   29780             DIP("vpblendd $%u,%s,%s,%s\n",
   29781                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   29782             assign(dV, loadLE(Ity_V256, mkexpr(addr)));
   29783          }
   29784          delta++;
   29785          for (i = 0; i < 8; i++) {
   29786             s[i] = IRTemp_INVALID;
   29787             d[i] = IRTemp_INVALID;
   29788          }
   29789          breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   29790                                &s[3], &s[2], &s[1], &s[0] );
   29791          breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
   29792                                &d[3], &d[2], &d[1], &d[0] );
   29793          for (i = 0; i < 8; i++)
   29794             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   29795          *uses_vvvv = True;
   29796          goto decode_success;
   29797       }
   29798       break;
   29799 
   29800    case 0x04:
   29801       /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
   29802       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29803          UChar  modrm = getUChar(delta);
   29804          UInt   imm8  = 0;
   29805          UInt   rG    = gregOfRexRM(pfx, modrm);
   29806          IRTemp sV    = newTemp(Ity_V256);
   29807          if (epartIsReg(modrm)) {
   29808             UInt rE = eregOfRexRM(pfx, modrm);
   29809             delta += 1;
   29810             imm8 = getUChar(delta);
   29811             DIP("vpermilps $%u,%s,%s\n",
   29812                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   29813             assign(sV, getYMMReg(rE));
   29814          } else {
   29815             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29816             delta += alen;
   29817             imm8 = getUChar(delta);
   29818             DIP("vpermilps $%u,%s,%s\n",
   29819                 imm8, dis_buf, nameYMMReg(rG));
   29820             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29821          }
   29822          delta++;
   29823          IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   29824          breakupV256toV128s( sV, &sVhi, &sVlo );
   29825          IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
   29826          IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
   29827          IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
   29828          putYMMReg(rG, res);
   29829          goto decode_success;
   29830       }
   29831       /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
   29832       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29833          UChar  modrm = getUChar(delta);
   29834          UInt   imm8  = 0;
   29835          UInt   rG    = gregOfRexRM(pfx, modrm);
   29836          IRTemp sV    = newTemp(Ity_V128);
   29837          if (epartIsReg(modrm)) {
   29838             UInt rE = eregOfRexRM(pfx, modrm);
   29839             delta += 1;
   29840             imm8 = getUChar(delta);
   29841             DIP("vpermilps $%u,%s,%s\n",
   29842                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   29843             assign(sV, getXMMReg(rE));
   29844          } else {
   29845             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29846             delta += alen;
   29847             imm8 = getUChar(delta);
   29848             DIP("vpermilps $%u,%s,%s\n",
   29849                 imm8, dis_buf, nameXMMReg(rG));
   29850             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   29851          }
   29852          delta++;
   29853          putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
   29854          goto decode_success;
   29855       }
   29856       break;
   29857 
   29858    case 0x05:
   29859       /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
   29860       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29861          UChar  modrm = getUChar(delta);
   29862          UInt   imm8  = 0;
   29863          UInt   rG    = gregOfRexRM(pfx, modrm);
   29864          IRTemp sV    = newTemp(Ity_V128);
   29865          if (epartIsReg(modrm)) {
   29866             UInt rE = eregOfRexRM(pfx, modrm);
   29867             delta += 1;
   29868             imm8 = getUChar(delta);
   29869             DIP("vpermilpd $%u,%s,%s\n",
   29870                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   29871             assign(sV, getXMMReg(rE));
   29872          } else {
   29873             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29874             delta += alen;
   29875             imm8 = getUChar(delta);
   29876             DIP("vpermilpd $%u,%s,%s\n",
   29877                 imm8, dis_buf, nameXMMReg(rG));
   29878             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   29879          }
   29880          delta++;
   29881          IRTemp s1 = newTemp(Ity_I64);
   29882          IRTemp s0 = newTemp(Ity_I64);
   29883          assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
   29884          assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
   29885          IRTemp dV = newTemp(Ity_V128);
   29886          assign(dV, binop(Iop_64HLtoV128,
   29887                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   29888                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   29889          putYMMRegLoAndZU(rG, mkexpr(dV));
   29890          goto decode_success;
   29891       }
   29892       /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
   29893       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29894          UChar  modrm = getUChar(delta);
   29895          UInt   imm8  = 0;
   29896          UInt   rG    = gregOfRexRM(pfx, modrm);
   29897          IRTemp sV    = newTemp(Ity_V256);
   29898          if (epartIsReg(modrm)) {
   29899             UInt rE = eregOfRexRM(pfx, modrm);
   29900             delta += 1;
   29901             imm8 = getUChar(delta);
   29902             DIP("vpermilpd $%u,%s,%s\n",
   29903                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   29904             assign(sV, getYMMReg(rE));
   29905          } else {
   29906             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29907             delta += alen;
   29908             imm8 = getUChar(delta);
   29909             DIP("vpermilpd $%u,%s,%s\n",
   29910                 imm8, dis_buf, nameYMMReg(rG));
   29911             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29912          }
   29913          delta++;
   29914          IRTemp s3, s2, s1, s0;
   29915          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   29916          breakupV256to64s(sV, &s3, &s2, &s1, &s0);
   29917          IRTemp dV = newTemp(Ity_V256);
   29918          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   29919                                mkexpr((imm8 & (1<<3)) ? s3 : s2),
   29920                                mkexpr((imm8 & (1<<2)) ? s3 : s2),
   29921                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   29922                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   29923          putYMMReg(rG, mkexpr(dV));
   29924          goto decode_success;
   29925       }
   29926       break;
   29927 
   29928    case 0x06:
   29929       /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
   29930       if (have66noF2noF3(pfx)
   29931           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   29932          UChar  modrm = getUChar(delta);
   29933          UInt   imm8  = 0;
   29934          UInt   rG    = gregOfRexRM(pfx, modrm);
   29935          UInt   rV    = getVexNvvvv(pfx);
   29936          IRTemp s00   = newTemp(Ity_V128);
   29937          IRTemp s01   = newTemp(Ity_V128);
   29938          IRTemp s10   = newTemp(Ity_V128);
   29939          IRTemp s11   = newTemp(Ity_V128);
   29940          assign(s00, getYMMRegLane128(rV, 0));
   29941          assign(s01, getYMMRegLane128(rV, 1));
   29942          if (epartIsReg(modrm)) {
   29943             UInt rE = eregOfRexRM(pfx, modrm);
   29944             delta += 1;
   29945             imm8 = getUChar(delta);
   29946             DIP("vperm2f128 $%u,%s,%s,%s\n",
   29947                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   29948             assign(s10, getYMMRegLane128(rE, 0));
   29949             assign(s11, getYMMRegLane128(rE, 1));
   29950          } else {
   29951             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29952             delta += alen;
   29953             imm8 = getUChar(delta);
   29954             DIP("vperm2f128 $%u,%s,%s,%s\n",
   29955                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   29956             assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
   29957                                                mkexpr(addr), mkU64(0))));
   29958             assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
   29959                                                mkexpr(addr), mkU64(16))));
   29960          }
   29961          delta++;
   29962 #        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
   29963                                            : ((_nn)==2) ? s10 : s11)
   29964          putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
   29965          putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
   29966 #        undef SEL
   29967          if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
   29968          if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
   29969          *uses_vvvv = True;
   29970          goto decode_success;
   29971       }
   29972       break;
   29973 
   29974    case 0x08:
   29975       /* VROUNDPS imm8, xmm2/m128, xmm1 */
   29976       /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
   29977       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29978          UChar  modrm = getUChar(delta);
   29979          UInt   rG    = gregOfRexRM(pfx, modrm);
   29980          IRTemp src   = newTemp(Ity_V128);
   29981          IRTemp s0    = IRTemp_INVALID;
   29982          IRTemp s1    = IRTemp_INVALID;
   29983          IRTemp s2    = IRTemp_INVALID;
   29984          IRTemp s3    = IRTemp_INVALID;
   29985          IRTemp rm    = newTemp(Ity_I32);
   29986          Int    imm   = 0;
   29987 
   29988          modrm = getUChar(delta);
   29989 
   29990          if (epartIsReg(modrm)) {
   29991             UInt rE = eregOfRexRM(pfx, modrm);
   29992             assign( src, getXMMReg( rE ) );
   29993             imm = getUChar(delta+1);
   29994             if (imm & ~15) break;
   29995             delta += 1+1;
   29996             DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   29997          } else {
   29998             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29999             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30000             imm = getUChar(delta+alen);
   30001             if (imm & ~15) break;
   30002             delta += alen+1;
   30003             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30004          }
   30005 
   30006          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30007             that encoding is the same as the encoding for IRRoundingMode,
   30008             we can use that value directly in the IR as a rounding
   30009             mode. */
   30010          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30011 
   30012          breakupV128to32s( src, &s3, &s2, &s1, &s0 );
   30013          putYMMRegLane128( rG, 1, mkV128(0) );
   30014 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30015                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30016          putYMMRegLane32F( rG, 3, CVT(s3) );
   30017          putYMMRegLane32F( rG, 2, CVT(s2) );
   30018          putYMMRegLane32F( rG, 1, CVT(s1) );
   30019          putYMMRegLane32F( rG, 0, CVT(s0) );
   30020 #        undef CVT
   30021          goto decode_success;
   30022       }
   30023       /* VROUNDPS imm8, ymm2/m256, ymm1 */
   30024       /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
   30025       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30026          UChar  modrm = getUChar(delta);
   30027          UInt   rG    = gregOfRexRM(pfx, modrm);
   30028          IRTemp src   = newTemp(Ity_V256);
   30029          IRTemp s0    = IRTemp_INVALID;
   30030          IRTemp s1    = IRTemp_INVALID;
   30031          IRTemp s2    = IRTemp_INVALID;
   30032          IRTemp s3    = IRTemp_INVALID;
   30033          IRTemp s4    = IRTemp_INVALID;
   30034          IRTemp s5    = IRTemp_INVALID;
   30035          IRTemp s6    = IRTemp_INVALID;
   30036          IRTemp s7    = IRTemp_INVALID;
   30037          IRTemp rm    = newTemp(Ity_I32);
   30038          Int    imm   = 0;
   30039 
   30040          modrm = getUChar(delta);
   30041 
   30042          if (epartIsReg(modrm)) {
   30043             UInt rE = eregOfRexRM(pfx, modrm);
   30044             assign( src, getYMMReg( rE ) );
   30045             imm = getUChar(delta+1);
   30046             if (imm & ~15) break;
   30047             delta += 1+1;
   30048             DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30049          } else {
   30050             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30051             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30052             imm = getUChar(delta+alen);
   30053             if (imm & ~15) break;
   30054             delta += alen+1;
   30055             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30056          }
   30057 
   30058          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30059             that encoding is the same as the encoding for IRRoundingMode,
   30060             we can use that value directly in the IR as a rounding
   30061             mode. */
   30062          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30063 
   30064          breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   30065 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30066                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30067          putYMMRegLane32F( rG, 7, CVT(s7) );
   30068          putYMMRegLane32F( rG, 6, CVT(s6) );
   30069          putYMMRegLane32F( rG, 5, CVT(s5) );
   30070          putYMMRegLane32F( rG, 4, CVT(s4) );
   30071          putYMMRegLane32F( rG, 3, CVT(s3) );
   30072          putYMMRegLane32F( rG, 2, CVT(s2) );
   30073          putYMMRegLane32F( rG, 1, CVT(s1) );
   30074          putYMMRegLane32F( rG, 0, CVT(s0) );
   30075 #        undef CVT
   30076          goto decode_success;
   30077       }
   30078 
   30079    case 0x09:
   30080       /* VROUNDPD imm8, xmm2/m128, xmm1 */
   30081       /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
   30082       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30083          UChar  modrm = getUChar(delta);
   30084          UInt   rG    = gregOfRexRM(pfx, modrm);
   30085          IRTemp src   = newTemp(Ity_V128);
   30086          IRTemp s0    = IRTemp_INVALID;
   30087          IRTemp s1    = IRTemp_INVALID;
   30088          IRTemp rm    = newTemp(Ity_I32);
   30089          Int    imm   = 0;
   30090 
   30091          modrm = getUChar(delta);
   30092 
   30093          if (epartIsReg(modrm)) {
   30094             UInt rE = eregOfRexRM(pfx, modrm);
   30095             assign( src, getXMMReg( rE ) );
   30096             imm = getUChar(delta+1);
   30097             if (imm & ~15) break;
   30098             delta += 1+1;
   30099             DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   30100          } else {
   30101             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30102             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30103             imm = getUChar(delta+alen);
   30104             if (imm & ~15) break;
   30105             delta += alen+1;
   30106             DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30107          }
   30108 
   30109          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30110             that encoding is the same as the encoding for IRRoundingMode,
   30111             we can use that value directly in the IR as a rounding
   30112             mode. */
   30113          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30114 
   30115          breakupV128to64s( src, &s1, &s0 );
   30116          putYMMRegLane128( rG, 1, mkV128(0) );
   30117 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30118                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30119          putYMMRegLane64F( rG, 1, CVT(s1) );
   30120          putYMMRegLane64F( rG, 0, CVT(s0) );
   30121 #        undef CVT
   30122          goto decode_success;
   30123       }
   30124       /* VROUNDPD imm8, ymm2/m256, ymm1 */
   30125       /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
   30126       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30127          UChar  modrm = getUChar(delta);
   30128          UInt   rG    = gregOfRexRM(pfx, modrm);
   30129          IRTemp src   = newTemp(Ity_V256);
   30130          IRTemp s0    = IRTemp_INVALID;
   30131          IRTemp s1    = IRTemp_INVALID;
   30132          IRTemp s2    = IRTemp_INVALID;
   30133          IRTemp s3    = IRTemp_INVALID;
   30134          IRTemp rm    = newTemp(Ity_I32);
   30135          Int    imm   = 0;
   30136 
   30137          modrm = getUChar(delta);
   30138 
   30139          if (epartIsReg(modrm)) {
   30140             UInt rE = eregOfRexRM(pfx, modrm);
   30141             assign( src, getYMMReg( rE ) );
   30142             imm = getUChar(delta+1);
   30143             if (imm & ~15) break;
   30144             delta += 1+1;
   30145             DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30146          } else {
   30147             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30148             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30149             imm = getUChar(delta+alen);
   30150             if (imm & ~15) break;
   30151             delta += alen+1;
   30152             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30153          }
   30154 
   30155          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30156             that encoding is the same as the encoding for IRRoundingMode,
   30157             we can use that value directly in the IR as a rounding
   30158             mode. */
   30159          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30160 
   30161          breakupV256to64s( src, &s3, &s2, &s1, &s0 );
   30162 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30163                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30164          putYMMRegLane64F( rG, 3, CVT(s3) );
   30165          putYMMRegLane64F( rG, 2, CVT(s2) );
   30166          putYMMRegLane64F( rG, 1, CVT(s1) );
   30167          putYMMRegLane64F( rG, 0, CVT(s0) );
   30168 #        undef CVT
   30169          goto decode_success;
   30170       }
   30171 
   case 0x0A:
   case 0x0B:
      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Bool   isD   = opc == 0x0B; /* 0x0B rounds an F64, 0x0A an F32 */
         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm   = 0;

         /* Fetch the scalar to round from the low lane of the E operand
            (register or memory), together with the trailing imm8. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            assign( src,
                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
            imm = getUChar(delta+1);
            /* imm8 bits 4..7 are reserved: fail the decode (break
               exits the opcode switch). */
            if (imm & ~15) break;
            delta += 1+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) break;
            delta += alen+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         /* Build the destination: rounded scalar in the low lane, the
            rest of the low 128 bits copied from rV, upper YMM half
            zeroed.  For the SS case the F32 lane 1 must also come from
            rV before the (covering) 64-bit lane-1 copy below. */
         if (isD)
            putXMMRegLane64F( rG, 0, mkexpr(res) );
         else {
            putXMMRegLane32F( rG, 0, mkexpr(res) );
            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
         }
         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30229 
   30230    case 0x0C:
   30231       /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
   30232       /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
   30233       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30234          UChar  modrm = getUChar(delta);
   30235          UInt   imm8;
   30236          UInt   rG    = gregOfRexRM(pfx, modrm);
   30237          UInt   rV    = getVexNvvvv(pfx);
   30238          IRTemp sV    = newTemp(Ity_V256);
   30239          IRTemp sE    = newTemp(Ity_V256);
   30240          assign ( sV, getYMMReg(rV) );
   30241          if (epartIsReg(modrm)) {
   30242             UInt rE = eregOfRexRM(pfx, modrm);
   30243             delta += 1;
   30244             imm8 = getUChar(delta);
   30245             DIP("vblendps $%u,%s,%s,%s\n",
   30246                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   30247             assign(sE, getYMMReg(rE));
   30248          } else {
   30249             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30250             delta += alen;
   30251             imm8 = getUChar(delta);
   30252             DIP("vblendps $%u,%s,%s,%s\n",
   30253                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   30254             assign(sE, loadLE(Ity_V256, mkexpr(addr)));
   30255          }
   30256          delta++;
   30257          putYMMReg( rG,
   30258                     mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
   30259          *uses_vvvv = True;
   30260          goto decode_success;
   30261       }
   30262       /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
   30263       /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
   30264       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30265          UChar  modrm = getUChar(delta);
   30266          UInt   imm8;
   30267          UInt   rG    = gregOfRexRM(pfx, modrm);
   30268          UInt   rV    = getVexNvvvv(pfx);
   30269          IRTemp sV    = newTemp(Ity_V128);
   30270          IRTemp sE    = newTemp(Ity_V128);
   30271          assign ( sV, getXMMReg(rV) );
   30272          if (epartIsReg(modrm)) {
   30273             UInt rE = eregOfRexRM(pfx, modrm);
   30274             delta += 1;
   30275             imm8 = getUChar(delta);
   30276             DIP("vblendps $%u,%s,%s,%s\n",
   30277                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   30278             assign(sE, getXMMReg(rE));
   30279          } else {
   30280             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30281             delta += alen;
   30282             imm8 = getUChar(delta);
   30283             DIP("vblendps $%u,%s,%s,%s\n",
   30284                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   30285             assign(sE, loadLE(Ity_V128, mkexpr(addr)));
   30286          }
   30287          delta++;
   30288          putYMMRegLoAndZU( rG,
   30289                            mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
   30290          *uses_vvvv = True;
   30291          goto decode_success;
   30292       }
   30293       break;
   30294 
   30295    case 0x0D:
   30296       /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
   30297       /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
   30298       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30299          UChar  modrm = getUChar(delta);
   30300          UInt   imm8;
   30301          UInt   rG    = gregOfRexRM(pfx, modrm);
   30302          UInt   rV    = getVexNvvvv(pfx);
   30303          IRTemp sV    = newTemp(Ity_V256);
   30304          IRTemp sE    = newTemp(Ity_V256);
   30305          assign ( sV, getYMMReg(rV) );
   30306          if (epartIsReg(modrm)) {
   30307             UInt rE = eregOfRexRM(pfx, modrm);
   30308             delta += 1;
   30309             imm8 = getUChar(delta);
   30310             DIP("vblendpd $%u,%s,%s,%s\n",
   30311                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   30312             assign(sE, getYMMReg(rE));
   30313          } else {
   30314             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30315             delta += alen;
   30316             imm8 = getUChar(delta);
   30317             DIP("vblendpd $%u,%s,%s,%s\n",
   30318                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   30319             assign(sE, loadLE(Ity_V256, mkexpr(addr)));
   30320          }
   30321          delta++;
   30322          putYMMReg( rG,
   30323                     mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
   30324          *uses_vvvv = True;
   30325          goto decode_success;
   30326       }
   30327       /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
   30328       /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
   30329       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30330          UChar  modrm = getUChar(delta);
   30331          UInt   imm8;
   30332          UInt   rG    = gregOfRexRM(pfx, modrm);
   30333          UInt   rV    = getVexNvvvv(pfx);
   30334          IRTemp sV    = newTemp(Ity_V128);
   30335          IRTemp sE    = newTemp(Ity_V128);
   30336          assign ( sV, getXMMReg(rV) );
   30337          if (epartIsReg(modrm)) {
   30338             UInt rE = eregOfRexRM(pfx, modrm);
   30339             delta += 1;
   30340             imm8 = getUChar(delta);
   30341             DIP("vblendpd $%u,%s,%s,%s\n",
   30342                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   30343             assign(sE, getXMMReg(rE));
   30344          } else {
   30345             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30346             delta += alen;
   30347             imm8 = getUChar(delta);
   30348             DIP("vblendpd $%u,%s,%s,%s\n",
   30349                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   30350             assign(sE, loadLE(Ity_V128, mkexpr(addr)));
   30351          }
   30352          delta++;
   30353          putYMMRegLoAndZU( rG,
   30354                            mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
   30355          *uses_vvvv = True;
   30356          goto decode_success;
   30357       }
   30358       break;
   30359 
   case 0x0E:
      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         /* Step over the imm8 byte. */
         delta++;
         /* Per-16-bit-lane blend of sE/sV under imm8; see
            math_PBLENDW_128 for the lane selection.  Result goes to the
            low half of rG, upper YMM half zeroed. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         IRTemp sVhi, sVlo, sEhi, sElo;
         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;
         /* 256-bit form: split into 128-bit halves and apply the same
            imm8 to each half, then recombine (high:low). */
         breakupV256toV128s( sV, &sVhi, &sVlo );
         breakupV256toV128s( sE, &sEhi, &sElo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30429 
   30430    case 0x0F:
   30431       /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
   30432       /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
   30433       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30434          UChar  modrm = getUChar(delta);
   30435          UInt   rG    = gregOfRexRM(pfx, modrm);
   30436          UInt   rV    = getVexNvvvv(pfx);
   30437          IRTemp sV    = newTemp(Ity_V128);
   30438          IRTemp dV    = newTemp(Ity_V128);
   30439          UInt   imm8;
   30440 
   30441          assign( dV, getXMMReg(rV) );
   30442 
   30443          if ( epartIsReg( modrm ) ) {
   30444             UInt   rE = eregOfRexRM(pfx, modrm);
   30445             assign( sV, getXMMReg(rE) );
   30446             imm8 = getUChar(delta+1);
   30447             delta += 1+1;
   30448             DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameXMMReg(rE),
   30449                                            nameXMMReg(rV), nameXMMReg(rG));
   30450          } else {
   30451             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30452             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   30453             imm8 = getUChar(delta+alen);
   30454             delta += alen+1;
   30455             DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
   30456                                            nameXMMReg(rV), nameXMMReg(rG));
   30457          }
   30458 
   30459          IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
   30460          putYMMRegLoAndZU( rG, mkexpr(res) );
   30461          *uses_vvvv = True;
   30462          goto decode_success;
   30463       }
   30464       /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
   30465       /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
   30466       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30467          UChar  modrm = getUChar(delta);
   30468          UInt   rG    = gregOfRexRM(pfx, modrm);
   30469          UInt   rV    = getVexNvvvv(pfx);
   30470          IRTemp sV    = newTemp(Ity_V256);
   30471          IRTemp dV    = newTemp(Ity_V256);
   30472          IRTemp sHi, sLo, dHi, dLo;
   30473          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   30474          UInt   imm8;
   30475 
   30476          assign( dV, getYMMReg(rV) );
   30477 
   30478          if ( epartIsReg( modrm ) ) {
   30479             UInt   rE = eregOfRexRM(pfx, modrm);
   30480             assign( sV, getYMMReg(rE) );
   30481             imm8 = getUChar(delta+1);
   30482             delta += 1+1;
   30483             DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameYMMReg(rE),
   30484                                            nameYMMReg(rV), nameYMMReg(rG));
   30485          } else {
   30486             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30487             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   30488             imm8 = getUChar(delta+alen);
   30489             delta += alen+1;
   30490             DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
   30491                                            nameYMMReg(rV), nameYMMReg(rG));
   30492          }
   30493 
   30494          breakupV256toV128s( dV, &dHi, &dLo );
   30495          breakupV256toV128s( sV, &sHi, &sLo );
   30496          putYMMReg( rG, binop( Iop_V128HLtoV256,
   30497                                mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
   30498                                mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
   30499                     );
   30500          *uses_vvvv = True;
   30501          goto decode_success;
   30502       }
   30503       break;
   30504 
   case 0x14:
      /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         /* NOTE(review): this is the VEX decode path, yet isAvx is
            passed as False here whereas the sibling cases 0x15..0x17
            pass True -- verify against dis_PEXTRB_128_GtoE (likely it
            only affects the printed mnemonic, since the destination is
            a GPR or memory, not an XMM register). */
         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   30513 
   30514    case 0x15:
   30515       /* VPEXTRW imm8, reg/m16, xmm2 */
   30516       /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
   30517       if (have66noF2noF3(pfx)
   30518           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30519          delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
   30520          goto decode_success;
   30521       }
   30522       break;
   30523 
   30524    case 0x16:
   30525       /* VPEXTRD imm8, r32/m32, xmm2 */
   30526       /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
   30527       if (have66noF2noF3(pfx)
   30528           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30529          delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
   30530          goto decode_success;
   30531       }
   30532       /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
   30533       if (have66noF2noF3(pfx)
   30534           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   30535          delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
   30536          goto decode_success;
   30537       }
   30538       break;
   30539 
   30540    case 0x17:
   30541       /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
   30542       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30543          delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
   30544          goto decode_success;
   30545       }
   30546       break;
   30547 
   case 0x18:
      /* VINSERTF128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp t128  = newTemp(Ity_V128);
         /* Fetch the 128-bit payload from register or memory, plus the
            trailing imm8 lane selector. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;
         /* Copy both 128-bit lanes of rV into rG, then overwrite the
            lane selected by imm8 bit 0 with the new value.  Bits 1..7
            of the imm8 are ignored. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30582 
   30583    case 0x19:
   30584      /* VEXTRACTF128 $lane_no, rS, r/m
   30585         ::: r/m:V128 = a lane of rS:V256 (RM format) */
   30586      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
   30587       if (have66noF2noF3(pfx)
   30588           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30589          UChar  modrm = getUChar(delta);
   30590          UInt   ib    = 0;
   30591          UInt   rS    = gregOfRexRM(pfx, modrm);
   30592          IRTemp t128  = newTemp(Ity_V128);
   30593          if (epartIsReg(modrm)) {
   30594             UInt rD = eregOfRexRM(pfx, modrm);
   30595             delta += 1;
   30596             ib = getUChar(delta);
   30597             assign(t128, getYMMRegLane128(rS, ib & 1));
   30598             putYMMRegLoAndZU(rD, mkexpr(t128));
   30599             DIP("vextractf128 $%u,%s,%s\n",
   30600                 ib, nameXMMReg(rS), nameYMMReg(rD));
   30601          } else {
   30602             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30603             delta += alen;
   30604             ib = getUChar(delta);
   30605             assign(t128, getYMMRegLane128(rS, ib & 1));
   30606             storeLE(mkexpr(addr), mkexpr(t128));
   30607             DIP("vextractf128 $%u,%s,%s\n",
   30608                 ib, nameYMMReg(rS), dis_buf);
   30609          }
   30610          delta++;
   30611          /* doesn't use vvvv */
   30612          goto decode_success;
   30613       }
   30614       break;
   30615 
   case 0x20:
      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm  = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx, modrm);
         UInt   rV     = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp src_u8 = newTemp(Ity_I8);

         /* Fetch the byte to insert: the low 8 bits of a GPR, or an
            8-bit memory load.  Only the low 4 bits of the imm8 are
            used (they give the destination byte index). */
         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)(getUChar(delta+1) & 15);
            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
            delta += 1+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 15);
            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Insert the byte into a copy of rV; result to the low half
            of rG with the upper YMM half zeroed. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30650 
   30651    case 0x21:
   30652       /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
   30653          = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
   30654       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30655          UChar  modrm = getUChar(delta);
   30656          UInt   rG    = gregOfRexRM(pfx, modrm);
   30657          UInt   rV    = getVexNvvvv(pfx);
   30658          UInt   imm8;
   30659          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   30660          const IRTemp inval = IRTemp_INVALID;
   30661 
   30662          if ( epartIsReg( modrm ) ) {
   30663             UInt   rE = eregOfRexRM(pfx, modrm);
   30664             IRTemp vE = newTemp(Ity_V128);
   30665             assign( vE, getXMMReg(rE) );
   30666             IRTemp dsE[4] = { inval, inval, inval, inval };
   30667             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   30668             imm8 = getUChar(delta+1);
   30669             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   30670             delta += 1+1;
   30671             DIP( "insertps $%u, %s,%s\n",
   30672                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   30673          } else {
   30674             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30675             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   30676             imm8 = getUChar(delta+alen);
   30677             delta += alen+1;
   30678             DIP( "insertps $%u, %s,%s\n",
   30679                  imm8, dis_buf, nameXMMReg(rG) );
   30680          }
   30681 
   30682          IRTemp vV = newTemp(Ity_V128);
   30683          assign( vV, getXMMReg(rV) );
   30684 
   30685          putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
   30686          *uses_vvvv = True;
   30687          goto decode_success;
   30688       }
   30689       break;
   30690 
   30691    case 0x22:
   30692       /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
   30693       if (have66noF2noF3(pfx)
   30694           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30695          UChar  modrm = getUChar(delta);
   30696          UInt   rG    = gregOfRexRM(pfx, modrm);
   30697          UInt   rV    = getVexNvvvv(pfx);
   30698          Int    imm8_10;
   30699          IRTemp src_u32 = newTemp(Ity_I32);
   30700 
   30701          if ( epartIsReg( modrm ) ) {
   30702             UInt rE = eregOfRexRM(pfx,modrm);
   30703             imm8_10 = (Int)(getUChar(delta+1) & 3);
   30704             assign( src_u32, getIReg32( rE ) );
   30705             delta += 1+1;
   30706             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30707                  imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
   30708          } else {
   30709             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30710             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   30711             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   30712             delta += alen+1;
   30713             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30714                  imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   30715          }
   30716 
   30717          IRTemp src_vec = newTemp(Ity_V128);
   30718          assign(src_vec, getXMMReg( rV ));
   30719          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   30720          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   30721          *uses_vvvv = True;
   30722          goto decode_success;
   30723       }
   30724       /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
   30725       if (have66noF2noF3(pfx)
   30726           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   30727          UChar  modrm = getUChar(delta);
   30728          UInt   rG    = gregOfRexRM(pfx, modrm);
   30729          UInt   rV    = getVexNvvvv(pfx);
   30730          Int    imm8_0;
   30731          IRTemp src_u64 = newTemp(Ity_I64);
   30732 
   30733          if ( epartIsReg( modrm ) ) {
   30734             UInt rE = eregOfRexRM(pfx,modrm);
   30735             imm8_0 = (Int)(getUChar(delta+1) & 1);
   30736             assign( src_u64, getIReg64( rE ) );
   30737             delta += 1+1;
   30738             DIP( "vpinsrq $%d,%s,%s,%s\n",
   30739                  imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
   30740          } else {
   30741             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30742             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   30743             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   30744             delta += alen+1;
   30745             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30746                  imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   30747          }
   30748 
   30749          IRTemp src_vec = newTemp(Ity_V128);
   30750          assign(src_vec, getXMMReg( rV ));
   30751          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   30752          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   30753          *uses_vvvv = True;
   30754          goto decode_success;
   30755       }
   30756       break;
   30757 
   case 0x38:
      /* VINSERTI128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib
         Copy ymm rV to ymm rG, then overwrite the 128-bit lane
         selected by ib bit 0 with the xmm/m128 source. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;                       /* lane-select imm8 */
         UInt   rG    = gregOfRexRM(pfx, modrm); /* destination ymm */
         UInt   rV    = getVexNvvvv(pfx);        /* first source ymm */
         IRTemp t128  = newTemp(Ity_V128);       /* 128 bits to insert */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);   /* imm8 follows the modrm byte */
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);   /* imm8 follows the amode */
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* step over the imm8 byte */
         /* Copy both lanes of rV, then clobber the selected one. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30792 
   30793    case 0x39:
   30794       /* VEXTRACTI128 $lane_no, rS, r/m
   30795          ::: r/m:V128 = a lane of rS:V256 (RM format) */
   30796       /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
   30797       if (have66noF2noF3(pfx)
   30798           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30799          UChar  modrm = getUChar(delta);
   30800          UInt   ib    = 0;
   30801          UInt   rS    = gregOfRexRM(pfx, modrm);
   30802          IRTemp t128  = newTemp(Ity_V128);
   30803          if (epartIsReg(modrm)) {
   30804             UInt rD = eregOfRexRM(pfx, modrm);
   30805             delta += 1;
   30806             ib = getUChar(delta);
   30807             assign(t128, getYMMRegLane128(rS, ib & 1));
   30808             putYMMRegLoAndZU(rD, mkexpr(t128));
   30809             DIP("vextracti128 $%u,%s,%s\n",
   30810                 ib, nameXMMReg(rS), nameYMMReg(rD));
   30811          } else {
   30812             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30813             delta += alen;
   30814             ib = getUChar(delta);
   30815             assign(t128, getYMMRegLane128(rS, ib & 1));
   30816             storeLE(mkexpr(addr), mkexpr(t128));
   30817             DIP("vextracti128 $%u,%s,%s\n",
   30818                 ib, nameYMMReg(rS), dis_buf);
   30819          }
   30820          delta++;
   30821          /* doesn't use vvvv */
   30822          goto decode_success;
   30823       }
   30824       break;
   30825 
   case 0x40:
      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib
         Dot product of packed single-precision floats; imm8 selects
         which lanes participate and which receive the result. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);  /* destination */
         UInt   rV      = getVexNvvvv(pfx);         /* first source */
         IRTemp dst_vec = newTemp(Ity_V128);        /* second source (E part) */
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib
         256-bit form: performed as two independent 128-bit dot
         products, one per lane, each using the same imm8. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V256);
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getYMMReg( rE ) );
            delta += 1+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V256);
         assign(src_vec, getYMMReg( rV ));
         IRTemp s0, s1, d0, d1;
         s0 = s1 = d0 = d1 = IRTemp_INVALID;
         /* Split each 256-bit operand into hi/lo 128-bit halves and
            do the dot product lane-wise. */
         breakupV256toV128s( dst_vec, &d1, &d0 );
         breakupV256toV128s( src_vec, &s1, &s0 );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30893 
   case 0x41:
      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib
         Dot product of packed double-precision floats; imm8 selects
         participating lanes and result broadcast.  128-bit form only. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);  /* destination */
         UInt   rV      = getVexNvvvv(pfx);         /* first source */
         IRTemp dst_vec = newTemp(Ity_V128);        /* second source (E part) */
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
         /* VEX write zeroes the upper ymm lane of the destination. */
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30926 
   case 0x42:
      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib
         Multiple sums of absolute byte differences; imm8 selects the
         source/destination block offsets. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);   /* from the E part */
         IRTemp dst_vec = newTemp(Ity_V128);   /* from vvvv */
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dst_vec, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
                                                        src_vec, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib
         256-bit form: two independent 128-bit MPSADBW operations, one
         per lane. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V256);
         IRTemp dst_vec = newTemp(Ity_V256);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;

         assign( dst_vec, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getYMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         breakupV256toV128s( dst_vec, &dHi, &dLo );
         breakupV256toV128s( src_vec, &sHi, &sLo );
         /* The upper lane's block selectors live 3 bits higher in
            imm8 (bits [5:3]); shifting exposes them to the 128-bit
            helper, which looks only at the low bits. */
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31004 
   case 0x44:
      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2)).
       * imm8 bits 0 and 4 select which 64-bit half of each operand
       * participates.
       */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         Int imm8;
         IRTemp sV    = newTemp(Ity_V128);   /* from the E part */
         IRTemp dV    = newTemp(Ity_V128);   /* from vvvv */
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( sV, getXMMReg(rE) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* VEX write zeroes the upper ymm lane of the destination. */
         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31044 
   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 46 /r ib
         Each 128-bit lane of the destination is chosen (by a 2-bit
         field of imm8) from the four source lanes, or zeroed when the
         field's zero-bit (imm8 bits 3/7) is set. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8  = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* sXY = lane Y of source X: s0* come from rV, s1* from E. */
         IRTemp s00   = newTemp(Ity_V128);
         IRTemp s01   = newTemp(Ity_V128);
         IRTemp s10   = newTemp(Ity_V128);
         IRTemp s11   = newTemp(Ity_V128);
         assign(s00, getYMMRegLane128(rV, 0));
         assign(s01, getYMMRegLane128(rV, 1));
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, getYMMRegLane128(rE, 0));
            assign(s11, getYMMRegLane128(rE, 1));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            /* Load the two 128-bit halves of the m256 operand; the
               +0 add keeps the two loads symmetric. */
            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(0))));
            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(16))));
         }
         delta++;   /* step over the imm8 byte */
         /* Map a 2-bit selector to one of the four source lanes. */
#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
                                           : ((_nn)==2) ? s10 : s11)
         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
#        undef SEL
         /* Zero-override bits: imm8[3] zeroes lane 0, imm8[7] lane 1. */
         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31090 
   case 0x4A:
      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4
         Variable blend of 32-bit lanes; the 4 and Iop_SarN32x4 tell
         the shared helper the lane width and how to smear each lane's
         sign bit into a mask. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31111 
   case 0x4B:
      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4
         Variable blend of 64-bit lanes; cf. case 0x4A, but with
         8-byte granularity and Iop_SarN64x2 for mask generation. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31132 
   case 0x4C:
      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4
         Variable blend at byte granularity; cf. cases 0x4A/0x4B, with
         1-byte lanes and Iop_SarN8x16 for mask generation. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31153 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         /* The helper dispatches on opc; it leaves delta unchanged
            when it cannot decode the particular imm8 variant. */
         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
         if (delta > delta0) goto decode_success;
         /* else fall though; dis_PCMPxSTRx failed to decode it */
      }
      break;
   31172 
   case 0xDF:
      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Delegate to the shared SSE/AVX helper.  This is the
            VEX-encoded path, so pass True for isAvx (the previous
            inline comment "!isAvx" contradicted the value passed). */
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   31180 
   case 0xF0:
      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i
         BMI2 rotate-right-without-flags: dest = src ror imm8.
         NOTE(review): getRexW here reads VEX.W (32- vs 64-bit op),
         and !haveREX presumably enforces that no legacy REX prefix
         accompanies the VEX encoding — confirm against the prefix
         machinery. */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         UChar   rm   = getUChar(delta);
         UChar   imm8;

         if (epartIsReg(rm)) {
            imm8 = getUChar(delta+1);
            assign( src, getIRegE(size,pfx,rm) );
            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
                                   nameIRegG(size,pfx,rm));
            delta += 2;   /* modrm byte + imm8 byte */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            imm8 = getUChar(delta+alen);
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
            delta += alen + 1;
         }
         /* Rotate count is taken modulo the operand width. */
         imm8 &= 8*size-1;

         /* dst = (src >>u imm8) | (src << (size-imm8));
            the imm8 == 0 case is special-cased to avoid an illegal
            shift by the full operand width. */
         putIRegG( size, pfx, rm,
                   imm8 == 0 ? mkexpr(src)
                   : binop( mkSizedOp(ty,Iop_Or8),
                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
                                   mkU8(imm8) ),
                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
                                   mkU8(8*size-imm8) ) ) );
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   31218 
   31219    default:
   31220       break;
   31221 
   31222    }
   31223 
   31224   //decode_failure:
   31225    return deltaIN;
   31226 
   31227   decode_success:
   31228    return delta;
   31229 }
   31230 
   31231 
   31232 /*------------------------------------------------------------*/
   31233 /*---                                                      ---*/
   31234 /*--- Disassemble a single instruction                     ---*/
   31235 /*---                                                      ---*/
   31236 /*------------------------------------------------------------*/
   31237 
   31238 /* Disassemble a single instruction into IR.  The instruction is
   31239    located in host memory at &guest_code[delta]. */
   31240 
   31241 static
   31242 DisResult disInstr_AMD64_WRK (
   31243              /*OUT*/Bool* expect_CAS,
   31244              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   31245              Bool         resteerCisOk,
   31246              void*        callback_opaque,
   31247              Long         delta64,
   31248              const VexArchInfo* archinfo,
   31249              const VexAbiInfo*  vbi,
   31250              Bool         sigill_diag
   31251           )
   31252 {
   31253    IRTemp    t1, t2;
   31254    UChar     pre;
   31255    Int       n, n_prefixes;
   31256    DisResult dres;
   31257 
   31258    /* The running delta */
   31259    Long delta = delta64;
   31260 
   31261    /* Holds eip at the start of the insn, so that we can print
   31262       consistent error messages for unimplemented insns. */
   31263    Long delta_start = delta;
   31264 
   31265    /* sz denotes the nominal data-op size of the insn; we change it to
   31266       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   31267       conflict REX.W takes precedence. */
   31268    Int sz = 4;
   31269 
   31270    /* pfx holds the summary of prefixes. */
   31271    Prefix pfx = PFX_EMPTY;
   31272 
   31273    /* Holds the computed opcode-escape indication. */
   31274    Escape esc = ESC_NONE;
   31275 
   31276    /* Set result defaults. */
   31277    dres.whatNext    = Dis_Continue;
   31278    dres.len         = 0;
   31279    dres.continueAt  = 0;
   31280    dres.jk_StopHere = Ijk_INVALID;
   31281    *expect_CAS = False;
   31282 
   31283    vassert(guest_RIP_next_assumed == 0);
   31284    vassert(guest_RIP_next_mustcheck == False);
   31285 
   31286    t1 = t2 = IRTemp_INVALID;
   31287 
   31288    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   31289 
   31290    /* Spot "Special" instructions (see comment at top of file). */
   31291    {
   31292       const UChar* code = guest_code + delta;
   31293       /* Spot the 16-byte preamble:
   31294          48C1C703   rolq $3,  %rdi
   31295          48C1C70D   rolq $13, %rdi
   31296          48C1C73D   rolq $61, %rdi
   31297          48C1C733   rolq $51, %rdi
   31298       */
   31299       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   31300                                                && code[ 3] == 0x03 &&
   31301           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   31302                                                && code[ 7] == 0x0D &&
   31303           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   31304                                                && code[11] == 0x3D &&
   31305           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   31306                                                && code[15] == 0x33) {
   31307          /* Got a "Special" instruction preamble.  Which one is it? */
   31308          if (code[16] == 0x48 && code[17] == 0x87
   31309                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   31310             /* %RDX = client_request ( %RAX ) */
   31311             DIP("%%rdx = client_request ( %%rax )\n");
   31312             delta += 19;
   31313             jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
   31314             vassert(dres.whatNext == Dis_StopHere);
   31315             goto decode_success;
   31316          }
   31317          else
   31318          if (code[16] == 0x48 && code[17] == 0x87
   31319                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   31320             /* %RAX = guest_NRADDR */
   31321             DIP("%%rax = guest_NRADDR\n");
   31322             delta += 19;
   31323             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   31324             goto decode_success;
   31325          }
   31326          else
   31327          if (code[16] == 0x48 && code[17] == 0x87
   31328                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   31329             /* call-noredir *%RAX */
   31330             DIP("call-noredir *%%rax\n");
   31331             delta += 19;
   31332             t1 = newTemp(Ity_I64);
   31333             assign(t1, getIRegRAX(8));
   31334             t2 = newTemp(Ity_I64);
   31335             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   31336             putIReg64(R_RSP, mkexpr(t2));
   31337             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   31338             jmp_treg(&dres, Ijk_NoRedir, t1);
   31339             vassert(dres.whatNext == Dis_StopHere);
   31340             goto decode_success;
   31341          }
   31342          else
   31343          if (code[16] == 0x48 && code[17] == 0x87
   31344                               && code[18] == 0xff /* xchgq %rdi,%rdi */) {
   31345            /* IR injection */
   31346             DIP("IR injection\n");
   31347             vex_inject_ir(irsb, Iend_LE);
   31348 
   31349             // Invalidate the current insn. The reason is that the IRop we're
   31350             // injecting here can change. In which case the translation has to
   31351             // be redone. For ease of handling, we simply invalidate all the
   31352             // time.
   31353             stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
   31354             stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));
   31355 
   31356             delta += 19;
   31357 
   31358             stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   31359             dres.whatNext    = Dis_StopHere;
   31360             dres.jk_StopHere = Ijk_InvalICache;
   31361             goto decode_success;
   31362          }
   31363          /* We don't know what it is. */
   31364          goto decode_failure;
   31365          /*NOTREACHED*/
   31366       }
   31367    }
   31368 
   31369    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   31370       as many invalid combinations as possible. */
   31371    n_prefixes = 0;
   31372    while (True) {
   31373       if (n_prefixes > 7) goto decode_failure;
   31374       pre = getUChar(delta);
   31375       switch (pre) {
   31376          case 0x66: pfx |= PFX_66; break;
   31377          case 0x67: pfx |= PFX_ASO; break;
   31378          case 0xF2: pfx |= PFX_F2; break;
   31379          case 0xF3: pfx |= PFX_F3; break;
   31380          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   31381          case 0x2E: pfx |= PFX_CS; break;
   31382          case 0x3E: pfx |= PFX_DS; break;
   31383          case 0x26: pfx |= PFX_ES; break;
   31384          case 0x64: pfx |= PFX_FS; break;
   31385          case 0x65: pfx |= PFX_GS; break;
   31386          case 0x36: pfx |= PFX_SS; break;
   31387          case 0x40 ... 0x4F:
   31388             pfx |= PFX_REX;
   31389             if (pre & (1<<3)) pfx |= PFX_REXW;
   31390             if (pre & (1<<2)) pfx |= PFX_REXR;
   31391             if (pre & (1<<1)) pfx |= PFX_REXX;
   31392             if (pre & (1<<0)) pfx |= PFX_REXB;
   31393             break;
   31394          default:
   31395             goto not_a_legacy_prefix;
   31396       }
   31397       n_prefixes++;
   31398       delta++;
   31399    }
   31400 
   31401    not_a_legacy_prefix:
   31402    /* We've used up all the non-VEX prefixes.  Parse and validate a
   31403       VEX prefix if that's appropriate. */
   31404    if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
   31405       /* Used temporarily for holding VEX prefixes. */
   31406       UChar vex0 = getUChar(delta);
   31407       if (vex0 == 0xC4) {
   31408          /* 3-byte VEX */
   31409          UChar vex1 = getUChar(delta+1);
   31410          UChar vex2 = getUChar(delta+2);
   31411          delta += 3;
   31412          pfx |= PFX_VEX;
   31413          /* Snarf contents of byte 1 */
   31414          /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31415          /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
   31416          /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
   31417          /* m-mmmm */
   31418          switch (vex1 & 0x1F) {
   31419             case 1: esc = ESC_0F;   break;
   31420             case 2: esc = ESC_0F38; break;
   31421             case 3: esc = ESC_0F3A; break;
   31422             /* Any other m-mmmm field will #UD */
   31423             default: goto decode_failure;
   31424          }
   31425          /* Snarf contents of byte 2 */
   31426          /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
   31427          /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
   31428          /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
   31429          /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
   31430          /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
   31431          /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
   31432          /* pp */
   31433          switch (vex2 & 3) {
   31434             case 0: break;
   31435             case 1: pfx |= PFX_66; break;
   31436             case 2: pfx |= PFX_F3; break;
   31437             case 3: pfx |= PFX_F2; break;
   31438             default: vassert(0);
   31439          }
   31440       }
   31441       else if (vex0 == 0xC5) {
   31442          /* 2-byte VEX */
   31443          UChar vex1 = getUChar(delta+1);
   31444          delta += 2;
   31445          pfx |= PFX_VEX;
   31446          /* Snarf contents of byte 1 */
   31447          /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31448          /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
   31449          /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
   31450          /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
   31451          /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
   31452          /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
   31453          /* pp */
   31454          switch (vex1 & 3) {
   31455             case 0: break;
   31456             case 1: pfx |= PFX_66; break;
   31457             case 2: pfx |= PFX_F3; break;
   31458             case 3: pfx |= PFX_F2; break;
   31459             default: vassert(0);
   31460          }
   31461          /* implied: */
   31462          esc = ESC_0F;
   31463       }
   31464       /* Can't have both VEX and REX */
   31465       if ((pfx & PFX_VEX) && (pfx & PFX_REX))
   31466          goto decode_failure; /* can't have both */
   31467    }
   31468 
   31469    /* Dump invalid combinations */
   31470    n = 0;
   31471    if (pfx & PFX_F2) n++;
   31472    if (pfx & PFX_F3) n++;
   31473    if (n > 1)
   31474       goto decode_failure; /* can't have both */
   31475 
   31476    n = 0;
   31477    if (pfx & PFX_CS) n++;
   31478    if (pfx & PFX_DS) n++;
   31479    if (pfx & PFX_ES) n++;
   31480    if (pfx & PFX_FS) n++;
   31481    if (pfx & PFX_GS) n++;
   31482    if (pfx & PFX_SS) n++;
   31483    if (n > 1)
   31484       goto decode_failure; /* multiple seg overrides == illegal */
   31485 
   31486    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   31487       that we should accept it. */
   31488    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_const)
   31489       goto decode_failure;
   31490 
   31491    /* Ditto for %gs prefixes. */
   31492    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_const)
   31493       goto decode_failure;
   31494 
   31495    /* Set up sz. */
   31496    sz = 4;
   31497    if (pfx & PFX_66) sz = 2;
   31498    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   31499 
   31500    /* Now we should be looking at the primary opcode byte or the
   31501       leading escapes.  Check that any LOCK prefix is actually
   31502       allowed. */
   31503    if (haveLOCK(pfx)) {
   31504       if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
   31505          DIP("lock ");
   31506       } else {
   31507          *expect_CAS = False;
   31508          goto decode_failure;
   31509       }
   31510    }
   31511 
   31512    /* Eat up opcode escape bytes, until we're really looking at the
   31513       primary opcode byte.  But only if there's no VEX present. */
   31514    if (!(pfx & PFX_VEX)) {
   31515       vassert(esc == ESC_NONE);
   31516       pre = getUChar(delta);
   31517       if (pre == 0x0F) {
   31518          delta++;
   31519          pre = getUChar(delta);
   31520          switch (pre) {
   31521             case 0x38: esc = ESC_0F38; delta++; break;
   31522             case 0x3A: esc = ESC_0F3A; delta++; break;
   31523             default:   esc = ESC_0F; break;
   31524          }
   31525       }
   31526    }
   31527 
   31528    /* So now we're really really looking at the primary opcode
   31529       byte. */
   31530    Long delta_at_primary_opcode = delta;
   31531 
   31532    if (!(pfx & PFX_VEX)) {
   31533       /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
   31534          instructions preserve the upper 128 bits of YMM registers;
   31535          iow we can simply ignore the presence of the upper halves of
   31536          these registers. */
   31537       switch (esc) {
   31538          case ESC_NONE:
   31539             delta = dis_ESC_NONE( &dres, expect_CAS,
   31540                                   resteerOkFn, resteerCisOk, callback_opaque,
   31541                                   archinfo, vbi, pfx, sz, delta );
   31542             break;
   31543          case ESC_0F:
   31544             delta = dis_ESC_0F  ( &dres, expect_CAS,
   31545                                   resteerOkFn, resteerCisOk, callback_opaque,
   31546                                   archinfo, vbi, pfx, sz, delta );
   31547             break;
   31548          case ESC_0F38:
   31549             delta = dis_ESC_0F38( &dres,
   31550                                   resteerOkFn, resteerCisOk, callback_opaque,
   31551                                   archinfo, vbi, pfx, sz, delta );
   31552             break;
   31553          case ESC_0F3A:
   31554             delta = dis_ESC_0F3A( &dres,
   31555                                   resteerOkFn, resteerCisOk, callback_opaque,
   31556                                   archinfo, vbi, pfx, sz, delta );
   31557             break;
   31558          default:
   31559             vassert(0);
   31560       }
   31561    } else {
   31562       /* VEX prefixed instruction */
   31563       /* Sloppy Intel wording: "An instruction encoded with a VEX.128
   31564          prefix that loads a YMM register operand ..." zeroes out bits
   31565          128 and above of the register. */
   31566       Bool uses_vvvv = False;
   31567       switch (esc) {
   31568          case ESC_0F:
   31569             delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
   31570                                       resteerOkFn, resteerCisOk,
   31571                                       callback_opaque,
   31572                                       archinfo, vbi, pfx, sz, delta );
   31573             break;
   31574          case ESC_0F38:
   31575             delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
   31576                                         resteerOkFn, resteerCisOk,
   31577                                         callback_opaque,
   31578                                         archinfo, vbi, pfx, sz, delta );
   31579             break;
   31580          case ESC_0F3A:
   31581             delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
   31582                                         resteerOkFn, resteerCisOk,
   31583                                         callback_opaque,
   31584                                         archinfo, vbi, pfx, sz, delta );
   31585             break;
   31586          case ESC_NONE:
   31587             /* The presence of a VEX prefix, by Intel definition,
   31588                always implies at least an 0F escape. */
   31589             goto decode_failure;
   31590          default:
   31591             vassert(0);
   31592       }
   31593       /* If the insn doesn't use VEX.vvvv then it must be all ones.
   31594          Check this. */
   31595       if (!uses_vvvv) {
   31596          if (getVexNvvvv(pfx) != 0)
   31597             goto decode_failure;
   31598       }
   31599    }
   31600 
   31601    vassert(delta - delta_at_primary_opcode >= 0);
   31602    vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
   31603 
   31604    /* Use delta == delta_at_primary_opcode to denote decode failure.
   31605       This implies that any successful decode must use at least one
   31606       byte up. */
   31607    if (delta == delta_at_primary_opcode)
   31608       goto decode_failure;
   31609    else
   31610       goto decode_success; /* \o/ */
   31611 
   31612 #if 0 /* XYZZY */
   31613 
   31614    /* ---------------------------------------------------- */
   31615    /* --- The SSE/SSE2 decoder.                        --- */
   31616    /* ---------------------------------------------------- */
   31617 
   31618    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   31619       previous life? */
   31620 
   31621    /* Note, this doesn't handle SSE3 right now.  All amd64s support
   31622       SSE2 as a minimum so there is no point distinguishing SSE1 vs
   31623       SSE2. */
   31624 
   31625    insn = &guest_code[delta];
   31626 
   31627    /* FXSAVE is spuriously at the start here only because it is
   31628       thusly placed in guest-x86/toIR.c. */
   31629 
   31630    /* ------ SSE decoder main ------ */
   31631 
   31632    /* ---------------------------------------------------- */
   31633    /* --- end of the SSE decoder.                      --- */
   31634    /* ---------------------------------------------------- */
   31635 
   31636    /* ---------------------------------------------------- */
   31637    /* --- start of the SSE2 decoder.                   --- */
   31638    /* ---------------------------------------------------- */
   31639 
   31640    /* ---------------------------------------------------- */
   31641    /* --- end of the SSE/SSE2 decoder.                 --- */
   31642    /* ---------------------------------------------------- */
   31643 
   31644    /* ---------------------------------------------------- */
   31645    /* --- start of the SSE3 decoder.                   --- */
   31646    /* ---------------------------------------------------- */
   31647 
   31648    /* ---------------------------------------------------- */
   31649    /* --- end of the SSE3 decoder.                     --- */
   31650    /* ---------------------------------------------------- */
   31651 
   31652    /* ---------------------------------------------------- */
   31653    /* --- start of the SSSE3 decoder.                  --- */
   31654    /* ---------------------------------------------------- */
   31655 
   31656    /* ---------------------------------------------------- */
   31657    /* --- end of the SSSE3 decoder.                    --- */
   31658    /* ---------------------------------------------------- */
   31659 
   31660    /* ---------------------------------------------------- */
   31661    /* --- start of the SSE4 decoder                    --- */
   31662    /* ---------------------------------------------------- */
   31663 
   31664    /* ---------------------------------------------------- */
   31665    /* --- end of the SSE4 decoder                      --- */
   31666    /* ---------------------------------------------------- */
   31667 
   31668    /*after_sse_decoders:*/
   31669 
   31670    /* Get the primary opcode. */
   31671    opc = getUChar(delta); delta++;
   31672 
   31673    /* We get here if the current insn isn't SSE, or this CPU doesn't
   31674       support SSE. */
   31675 
   31676    switch (opc) {
   31677 
   31678    /* ------------------------ Control flow --------------- */
   31679 
   31680    /* ------------------------ CWD/CDQ -------------------- */
   31681 
   31682    /* ------------------------ FPU ops -------------------- */
   31683 
   31684    /* ------------------------ INT ------------------------ */
   31685 
   31686    case 0xCD: { /* INT imm8 */
   31687       IRJumpKind jk = Ijk_Boring;
   31688       if (have66orF2orF3(pfx)) goto decode_failure;
   31689       d64 = getUChar(delta); delta++;
   31690       switch (d64) {
   31691          case 32: jk = Ijk_Sys_int32; break;
   31692          default: goto decode_failure;
   31693       }
   31694       guest_RIP_next_mustcheck = True;
   31695       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   31696       jmp_lit(jk, guest_RIP_next_assumed);
   31697       /* It's important that all ArchRegs carry their up-to-date value
   31698          at this point.  So we declare an end-of-block here, which
   31699          forces any TempRegs caching ArchRegs to be flushed. */
   31700       vassert(dres.whatNext == Dis_StopHere);
   31701       DIP("int $0x%02x\n", (UInt)d64);
   31702       break;
   31703    }
   31704 
   31705    /* ------------------------ Jcond, byte offset --------- */
   31706 
   31707    /* ------------------------ IMUL ----------------------- */
   31708 
   31709    /* ------------------------ MOV ------------------------ */
   31710 
   31711    /* ------------------------ MOVx ------------------------ */
   31712 
   31713    /* ------------------------ opl imm, A ----------------- */
   31714 
   31715    /* ------------------------ opl Ev, Gv ----------------- */
   31716 
   31717    /* ------------------------ opl Gv, Ev ----------------- */
   31718 
   31719    /* ------------------------ POP ------------------------ */
   31720 
   31721    /* ------------------------ PUSH ----------------------- */
   31722 
   31723    /* ------ AE: SCAS variants ------ */
   31724 
   31725    /* ------ A6, A7: CMPS variants ------ */
   31726 
   31727    /* ------ AA, AB: STOS variants ------ */
   31728 
   31729    /* ------ A4, A5: MOVS variants ------ */
   31730 
   31731    /* ------------------------ XCHG ----------------------- */
   31732 
   31733    /* ------------------------ IN / OUT ----------------------- */
   31734 
   31735    /* ------------------------ (Grp1 extensions) ---------- */
   31736 
   31737    /* ------------------------ (Grp2 extensions) ---------- */
   31738 
   31739    /* ------------------------ (Grp3 extensions) ---------- */
   31740 
   31741    /* ------------------------ (Grp4 extensions) ---------- */
   31742 
   31743    /* ------------------------ (Grp5 extensions) ---------- */
   31744 
   31745    /* ------------------------ Escapes to 2-byte opcodes -- */
   31746 
   31747    case 0x0F: {
   31748       opc = getUChar(delta); delta++;
   31749       switch (opc) {
   31750 
   31751       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   31752 
   31753       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   31754 
   31755       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   31756 
   31757       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   31758 
   31759       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   31760 
   31761       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   31762 
   31763       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   31764 
   31765       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   31766 
   31767       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   31768 
   31769       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   31770 
   31771       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   31772 
   31773       /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
   31774 
   31775       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   31776 
   31777       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   31778 
   31779       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   31780 
   31781       /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
   31782 
   31783       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   31784 
   31785       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   31786 
   31787       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   31788 
   31789       default:
   31790          goto decode_failure;
   31791    } /* switch (opc) for the 2-byte opcodes */
   31792    goto decode_success;
   31793    } /* case 0x0F: of primary opcode */
   31794 
   31795    /* ------------------------ ??? ------------------------ */
   31796 #endif /* XYZZY */
   31797 
   31798      //default:
   31799   decode_failure:
   31800    /* All decode failures end up here. */
   31801    if (sigill_diag) {
   31802       vex_printf("vex amd64->IR: unhandled instruction bytes: "
   31803                  "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   31804                  (Int)getUChar(delta_start+0),
   31805                  (Int)getUChar(delta_start+1),
   31806                  (Int)getUChar(delta_start+2),
   31807                  (Int)getUChar(delta_start+3),
   31808                  (Int)getUChar(delta_start+4),
   31809                  (Int)getUChar(delta_start+5),
   31810                  (Int)getUChar(delta_start+6),
   31811                  (Int)getUChar(delta_start+7) );
   31812       vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
   31813                  haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
   31814                  getRexX(pfx), getRexB(pfx));
   31815       vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
   31816                  haveVEX(pfx) ? 1 : 0, getVexL(pfx),
   31817                  getVexNvvvv(pfx),
   31818                  esc==ESC_NONE ? "NONE" :
   31819                    esc==ESC_0F ? "0F" :
   31820                    esc==ESC_0F38 ? "0F38" :
   31821                    esc==ESC_0F3A ? "0F3A" : "???");
   31822       vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
   31823                  have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
   31824                  haveF3(pfx) ? 1 : 0);
   31825    }
   31826 
   31827    /* Tell the dispatcher that this insn cannot be decoded, and so has
   31828       not been executed, and (is currently) the next to be executed.
   31829       RIP should be up-to-date since it made so at the start of each
   31830       insn, but nevertheless be paranoid and update it again right
   31831       now. */
   31832    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   31833    jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   31834    vassert(dres.whatNext == Dis_StopHere);
   31835    dres.len = 0;
   31836    /* We also need to say that a CAS is not expected now, regardless
   31837       of what it might have been set to at the start of the function,
   31838       since the IR that we've emitted just above (to synthesis a
   31839       SIGILL) does not involve any CAS, and presumably no other IR has
   31840       been emitted for this (non-decoded) insn. */
   31841    *expect_CAS = False;
   31842    return dres;
   31843 
   31844    //   } /* switch (opc) for the main (primary) opcode switch. */
   31845 
   31846   decode_success:
   31847    /* All decode successes end up here. */
   31848    switch (dres.whatNext) {
   31849       case Dis_Continue:
   31850          stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   31851          break;
   31852       case Dis_ResteerU:
   31853       case Dis_ResteerC:
   31854          stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
   31855          break;
   31856       case Dis_StopHere:
   31857          break;
   31858       default:
   31859          vassert(0);
   31860    }
   31861 
   31862    DIP("\n");
   31863    dres.len = toUInt(delta - delta_start);
   31864    return dres;
   31865 }
   31866 
   31867 #undef DIP
   31868 #undef DIS
   31869 
   31870 
   31871 /*------------------------------------------------------------*/
   31872 /*--- Top-level fn                                         ---*/
   31873 /*------------------------------------------------------------*/
   31874 
   31875 /* Disassemble a single instruction into IR.  The instruction
   31876    is located in host memory at &guest_code[delta]. */
   31877 
   31878 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   31879                            Bool         (*resteerOkFn) ( void*, Addr ),
   31880                            Bool         resteerCisOk,
   31881                            void*        callback_opaque,
   31882                            const UChar* guest_code_IN,
   31883                            Long         delta,
   31884                            Addr         guest_IP,
   31885                            VexArch      guest_arch,
   31886                            const VexArchInfo* archinfo,
   31887                            const VexAbiInfo*  abiinfo,
   31888                            VexEndness   host_endness_IN,
   31889                            Bool         sigill_diag_IN )
   31890 {
   31891    Int       i, x1, x2;
   31892    Bool      expect_CAS, has_CAS;
   31893    DisResult dres;
   31894 
   31895    /* Set globals (see top of this file) */
   31896    vassert(guest_arch == VexArchAMD64);
   31897    guest_code           = guest_code_IN;
   31898    irsb                 = irsb_IN;
   31899    host_endness         = host_endness_IN;
   31900    guest_RIP_curr_instr = guest_IP;
   31901    guest_RIP_bbstart    = guest_IP - delta;
   31902 
   31903    /* We'll consult these after doing disInstr_AMD64_WRK. */
   31904    guest_RIP_next_assumed   = 0;
   31905    guest_RIP_next_mustcheck = False;
   31906 
   31907    x1 = irsb_IN->stmts_used;
   31908    expect_CAS = False;
   31909    dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   31910                                resteerCisOk,
   31911                                callback_opaque,
   31912                                delta, archinfo, abiinfo, sigill_diag_IN );
   31913    x2 = irsb_IN->stmts_used;
   31914    vassert(x2 >= x1);
   31915 
   31916    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   31917       got it right.  Failure of this assertion is serious and denotes
   31918       a bug in disInstr. */
   31919    if (guest_RIP_next_mustcheck
   31920        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   31921       vex_printf("\n");
   31922       vex_printf("assumed next %%rip = 0x%llx\n",
   31923                  guest_RIP_next_assumed );
   31924       vex_printf(" actual next %%rip = 0x%llx\n",
   31925                  guest_RIP_curr_instr + dres.len );
   31926       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   31927    }
   31928 
   31929    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   31930       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   31931       IRCAS as directed by the returned expect_CAS value. */
   31932    has_CAS = False;
   31933    for (i = x1; i < x2; i++) {
   31934       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   31935          has_CAS = True;
   31936    }
   31937 
   31938    if (expect_CAS != has_CAS) {
   31939       /* inconsistency detected.  re-disassemble the instruction so as
   31940          to generate a useful error message; then assert. */
   31941       vex_traceflags |= VEX_TRACE_FE;
   31942       dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   31943                                   resteerCisOk,
   31944                                   callback_opaque,
   31945                                   delta, archinfo, abiinfo, sigill_diag_IN );
   31946       for (i = x1; i < x2; i++) {
   31947          vex_printf("\t\t");
   31948          ppIRStmt(irsb_IN->stmts[i]);
   31949          vex_printf("\n");
   31950       }
   31951       /* Failure of this assertion is serious and denotes a bug in
   31952          disInstr. */
   31953       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   31954    }
   31955 
   31956    return dres;
   31957 }
   31958 
   31959 
   31960 /*------------------------------------------------------------*/
   31961 /*--- Unused stuff                                         ---*/
   31962 /*------------------------------------------------------------*/
   31963 
   31964 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   31965 // this should ever be needed.
   31966 //
   31967 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   31968 //{
   31969 //   /* Scheme is simple: propagate the most significant 1-bit into all
   31970 //      lower positions in the word.  This gives a word of the form
   31971 //      0---01---1.  Now invert it, giving a word of the form
   31972 //      1---10---0, then do a population-count idiom (to count the 1s,
   31973 //      which is the number of leading zeroes, or the word size if the
   31974 //      original word was 0.
   31975 //   */
   31976 //   Int i;
   31977 //   IRTemp t[7];
   31978 //   for (i = 0; i < 7; i++) {
   31979 //      t[i] = newTemp(ty);
   31980 //   }
   31981 //   if (ty == Ity_I64) {
   31982 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   31983 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   31984 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   31985 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   31986 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   31987 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   31988 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   31989 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   31990 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   31991 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   31992 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   31993 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   31994 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   31995 //      return gen_POPCOUNT(ty, t[6]);
   31996 //   }
   31997 //   if (ty == Ity_I32) {
   31998 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   31999 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   32000 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   32001 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   32002 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   32003 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   32004 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   32005 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   32006 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   32007 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   32008 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   32009 //      return gen_POPCOUNT(ty, t[5]);
   32010 //   }
   32011 //   if (ty == Ity_I16) {
   32012 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   32013 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   32014 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   32015 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   32016 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   32017 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   32018 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   32019 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   32020 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   32021 //      return gen_POPCOUNT(ty, t[4]);
   32022 //   }
   32023 //   vassert(0);
   32024 //}
   32025 
   32026 
   32027 /*--------------------------------------------------------------------*/
   32028 /*--- end                                       guest_amd64_toIR.c ---*/
   32029 /*--------------------------------------------------------------------*/
   32030