Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2010 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     55      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     56      even when it isn't.
     57 
     58    * some of the FCOM cases could do with testing -- not convinced
     59      that the args are the right way round.
     60 
     61    * FSAVE does not re-initialise the FPU; it should do
     62 
     63    * FINIT not only initialises the FPU environment, it also zeroes
     64      all the FP registers.  It should leave the registers unchanged.
     65 
     66     RDTSC returns zero, always.
     67 
     68     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     69     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     70     only way to observe eflags[1], a proper fix would be to make that
     71     bit be set by PUSHF.
     72 
     73     This module uses global variables and so is not MT-safe (if that
     74     should ever become relevant).
     75 */
     76 
     77 /* Notes re address size overrides (0x67).
     78 
     79    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     80    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     81    and System Instructions"), Section 1.2.3 ("Address-Size Override
     82    Prefix"):
     83 
     84    0x67 applies to all explicit memory references, causing the top
     85    32 bits of the effective address to become zero.
     86 
     87    0x67 has no effect on stack references (push/pop); these always
     88    use a 64-bit address.
     89 
     90    0x67 changes the interpretation of instructions which implicitly
     91    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     92    instead.  These are:
     93 
     94       cmp{s,sb,sw,sd,sq}
     95       in{s,sb,sw,sd}
     96       jcxz, jecxz, jrcxz
     97       lod{s,sb,sw,sd,sq}
     98       loop{,e,bz,be,z}
     99       mov{s,sb,sw,sd,sq}
    100       out{s,sb,sw,sd}
    101       rep{,e,ne,nz}
    102       sca{s,sb,sw,sd,sq}
    103       sto{s,sb,sw,sd,sq}
    104       xlat{,b} */
    105 
    106 /* "Special" instructions.
    107 
    108    This instruction decoder can decode three special instructions
    109    which mean nothing natively (are no-ops as far as regs/mem are
    110    concerned) but have meaning for supporting Valgrind.  A special
    111    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    112    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    113    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    114    Following that, one of the following 3 are allowed (standard
    115    interpretation in parentheses):
    116 
    117       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    118       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    119       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    120 
    121    Any other bytes following the 16-byte preamble are illegal and
    122    constitute a failure in instruction decoding.  This all assumes
    123    that the preamble will never occur except in specific code
    124    fragments designed for Valgrind to catch.
    125 
    126    No prefixes may precede a "Special" instruction.
    127 */
    128 
    129 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    130    insns: the side-exit back to the start of the insn is done with
    131    Ijk_Boring.  This is quite wrong, it should be done with
    132    Ijk_NoRedir, since otherwise the side exit, which is intended to
    133    restart the instruction for whatever reason, could go somewhere
    134    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    135    no-redir jumps performance critical, at least for rep-prefixed
    136    instructions, since all iterations thereof would involve such a
    137    jump.  It's not such a big deal with casLE since the side exit is
    138    only taken if the CAS fails, that is, the location is contended,
    139    which is relatively unlikely.
    140 
    141    Note also, the test for CAS success vs failure is done using
    142    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    143    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    144    shouldn't definedness-check these comparisons.  See
    145    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    146    background/rationale.
    147 */
    148 
    149 /* LOCK prefixed instructions.  These are translated using IR-level
    150    CAS statements (IRCAS) and are believed to preserve atomicity, even
    151    from the point of view of some other process racing against a
    152    simulated one (presumably they communicate via a shared memory
    153    segment).
    154 
    155    Handlers which are aware of LOCK prefixes are:
    156       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    157       dis_cmpxchg_G_E  (cmpxchg)
    158       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    159       dis_Grp3         (not, neg)
    160       dis_Grp4         (inc, dec)
    161       dis_Grp5         (inc, dec)
    162       dis_Grp8_Imm     (bts, btc, btr)
    163       dis_bt_G_E       (bts, btc, btr)
    164       dis_xadd_G_E     (xadd)
    165 */
    166 
    167 
    168 #include "libvex_basictypes.h"
    169 #include "libvex_ir.h"
    170 #include "libvex.h"
    171 #include "libvex_guest_amd64.h"
    172 
    173 #include "main_util.h"
    174 #include "main_globals.h"
    175 #include "guest_generic_bb_to_IR.h"
    176 #include "guest_generic_x87.h"
    177 #include "guest_amd64_defs.h"
    178 
    179 
    180 /*------------------------------------------------------------*/
    181 /*--- Globals                                              ---*/
    182 /*------------------------------------------------------------*/
    183 
    184 /* These are set at the start of the translation of an insn, right
    185    down in disInstr_AMD64, so that we don't have to pass them around
    186    endlessly.  They are all constant during the translation of any
    187    given insn. */
    188 
    189 /* These are set at the start of the translation of a BB, so
    190    that we don't have to pass them around endlessly. */
    191 
    192 /* We need to know this to do sub-register accesses correctly. */
    193 static Bool host_is_bigendian;
    194 
    195 /* Pointer to the guest code area (points to start of BB, not to the
    196    insn being processed). */
    197 static UChar* guest_code;
    198 
    199 /* The guest address corresponding to guest_code[0]. */
    200 static Addr64 guest_RIP_bbstart;
    201 
    202 /* The guest address for the instruction currently being
    203    translated. */
    204 static Addr64 guest_RIP_curr_instr;
    205 
    206 /* The IRSB* into which we're generating code. */
    207 static IRSB* irsb;
    208 
    209 /* For ensuring that %rip-relative addressing is done right.  A read
    210    of %rip generates the address of the next instruction.  It may be
    211    that we don't conveniently know that inside disAMode().  For sanity
    212    checking, if the next insn %rip is needed, we make a guess at what
    213    it is, record that guess here, and set the accompanying Bool to
    214    indicate that -- after this insn's decode is finished -- that guess
    215    needs to be checked.  */
    216 
    217 /* At the start of each insn decode, is set to (0, False).
    218    After the decode, if _mustcheck is now True, _assumed is
    219    checked. */
    220 
    221 static Addr64 guest_RIP_next_assumed;
    222 static Bool   guest_RIP_next_mustcheck;
    223 
    224 
    225 /*------------------------------------------------------------*/
    226 /*--- Helpers for constructing IR.                         ---*/
    227 /*------------------------------------------------------------*/
    228 
    229 /* Generate a new temporary of the given type. */
    230 static IRTemp newTemp ( IRType ty )
    231 {
    232    vassert(isPlausibleIRType(ty));
    233    return newIRTemp( irsb->tyenv, ty );
    234 }
    235 
/* Add a statement to the list held by "irsb" (the IRSB currently
   being built). */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e", appending it to "irsb". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}
    247 
/* Build a unary-operator expression. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Build a binary-operator expression. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Build a ternary-operator expression. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression which reads temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    267 
/* Make an 8-bit constant; asserts the value fits in one byte. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Make a 16-bit constant; asserts the value fits in 16 bits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Make a 32-bit constant; asserts the value fits in 32 bits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

/* Make a 64-bit constant; any ULong value is representable. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}
    290 
    291 static IRExpr* mkU ( IRType ty, ULong i )
    292 {
    293    switch (ty) {
    294       case Ity_I8:  return mkU8(i);
    295       case Ity_I16: return mkU16(i);
    296       case Ity_I32: return mkU32(i);
    297       case Ity_I64: return mkU64(i);
    298       default: vpanic("mkU(amd64)");
    299    }
    300 }
    301 
/* Generate a little-endian store of 'data' to address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Generate a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    311 
/* Widen the 8-bit operation 'op8' to the width implied by 'ty'.
   This relies on the 8/16/32/64-bit variants of each listed
   operation being declared consecutively, in that order, in the
   IROp enumeration (libvex_ir.h), so that e.g. Iop_Add8 + 2 ==
   Iop_Add32.  The vassert restricts 'op8' to the operations for
   which that layout is relied upon. */
static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
    329 
    330 static
    331 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    332 {
    333    if (szSmall == 1 && szBig == 4) {
    334       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    335    }
    336    if (szSmall == 1 && szBig == 2) {
    337       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    338    }
    339    if (szSmall == 2 && szBig == 4) {
    340       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    341    }
    342    if (szSmall == 1 && szBig == 8 && !signd) {
    343       return unop(Iop_8Uto64, src);
    344    }
    345    if (szSmall == 1 && szBig == 8 && signd) {
    346       return unop(Iop_8Sto64, src);
    347    }
    348    if (szSmall == 2 && szBig == 8 && !signd) {
    349       return unop(Iop_16Uto64, src);
    350    }
    351    if (szSmall == 2 && szBig == 8 && signd) {
    352       return unop(Iop_16Sto64, src);
    353    }
    354    vpanic("doScalarWidening(amd64)");
    355 }
    356 
    357 
    358 
    359 /*------------------------------------------------------------*/
    360 /*--- Debugging output                                     ---*/
    361 /*------------------------------------------------------------*/
    362 
    363 /* Bomb out if we can't handle something. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   /* Print a diagnostic, then abort via vpanic -- never returns. */
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    370 
/* Disassembly-trace printf: emits output only when front-end tracing
   (VEX_TRACE_FE) is enabled in vex_traceflags. */
#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

/* As DIP, but formats into 'buf' instead of printing. */
#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)
    378 
    379 
    380 /*------------------------------------------------------------*/
    381 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    382 /*------------------------------------------------------------*/
    383 
    384 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    385 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    386 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    387 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    388 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    389 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    390 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    391 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    392 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    393 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    394 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    395 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    396 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    397 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    398 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    399 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    400 
    401 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    402 
    403 #define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
    404 #define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)
    405 
    406 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    407 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    408 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    409 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    410 
    411 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    412 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    413 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    414 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    415 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    416 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    417 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    418 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    419 //..
    420 //.. #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
    421 //.. #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
    422 //.. #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
    423 //.. #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
    424 //.. #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
    425 //.. #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
    426 //.. #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
    427 //.. #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
    428 
    429 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    430 #define OFFB_XMM0      offsetof(VexGuestAMD64State,guest_XMM0)
    431 #define OFFB_XMM1      offsetof(VexGuestAMD64State,guest_XMM1)
    432 #define OFFB_XMM2      offsetof(VexGuestAMD64State,guest_XMM2)
    433 #define OFFB_XMM3      offsetof(VexGuestAMD64State,guest_XMM3)
    434 #define OFFB_XMM4      offsetof(VexGuestAMD64State,guest_XMM4)
    435 #define OFFB_XMM5      offsetof(VexGuestAMD64State,guest_XMM5)
    436 #define OFFB_XMM6      offsetof(VexGuestAMD64State,guest_XMM6)
    437 #define OFFB_XMM7      offsetof(VexGuestAMD64State,guest_XMM7)
    438 #define OFFB_XMM8      offsetof(VexGuestAMD64State,guest_XMM8)
    439 #define OFFB_XMM9      offsetof(VexGuestAMD64State,guest_XMM9)
    440 #define OFFB_XMM10     offsetof(VexGuestAMD64State,guest_XMM10)
    441 #define OFFB_XMM11     offsetof(VexGuestAMD64State,guest_XMM11)
    442 #define OFFB_XMM12     offsetof(VexGuestAMD64State,guest_XMM12)
    443 #define OFFB_XMM13     offsetof(VexGuestAMD64State,guest_XMM13)
    444 #define OFFB_XMM14     offsetof(VexGuestAMD64State,guest_XMM14)
    445 #define OFFB_XMM15     offsetof(VexGuestAMD64State,guest_XMM15)
    446 #define OFFB_XMM16     offsetof(VexGuestAMD64State,guest_XMM16)
    447 
    448 #define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
    449 #define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
    450 #define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)
    451 
    452 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    453 
    454 
    455 /*------------------------------------------------------------*/
    456 /*--- Helper bits and pieces for deconstructing the        ---*/
    457 /*--- amd64 insn stream.                                   ---*/
    458 /*------------------------------------------------------------*/
    459 
    460 /* This is the AMD64 register encoding -- integer regs. */
    461 #define R_RAX 0
    462 #define R_RCX 1
    463 #define R_RDX 2
    464 #define R_RBX 3
    465 #define R_RSP 4
    466 #define R_RBP 5
    467 #define R_RSI 6
    468 #define R_RDI 7
    469 #define R_R8  8
    470 #define R_R9  9
    471 #define R_R10 10
    472 #define R_R11 11
    473 #define R_R12 12
    474 #define R_R13 13
    475 #define R_R14 14
    476 #define R_R15 15
    477 
    478 //.. #define R_AL (0+R_EAX)
    479 //.. #define R_AH (4+R_EAX)
    480 
    481 /* This is the Intel register encoding -- segment regs. */
    482 #define R_ES 0
    483 #define R_CS 1
    484 #define R_SS 2
    485 #define R_DS 3
    486 #define R_FS 4
    487 #define R_GS 5
    488 
    489 
    490 /* Various simple conversions */
    491 
    492 static ULong extend_s_8to64 ( UChar x )
    493 {
    494    return (ULong)((((Long)x) << 56) >> 56);
    495 }
    496 
    497 static ULong extend_s_16to64 ( UShort x )
    498 {
    499    return (ULong)((((Long)x) << 48) >> 48);
    500 }
    501 
    502 static ULong extend_s_32to64 ( UInt x )
    503 {
    504    return (ULong)((((Long)x) << 32) >> 32);
    505 }
    506 
    507 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    508    register or memory.  If so, the byte will have the form 11XXXYYY,
    509    where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   /* mod == 3 (both top bits set) means the e-part names a register,
      not memory. */
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number.  You should avoid
   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field of a modRM byte (low 3 bits only). */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}
    531 
    532 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    533 
/* Fetch the byte at offset 'delta' in the guest code. */
static UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

/* Fetch a 16-bit little-endian value at offset 'delta'. */
static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}
    546 
    547 //.. static UInt getUDisp ( Int size, Long delta )
    548 //.. {
    549 //..    switch (size) {
    550 //..       case 4: return getUDisp32(delta);
    551 //..       case 2: return getUDisp16(delta);
    552 //..       case 1: return getUChar(delta);
    553 //..       default: vpanic("getUDisp(x86)");
    554 //..    }
    555 //..    return 0; /*notreached*/
    556 //.. }
    557 
    558 
    559 /* Get a byte value out of the insn stream and sign-extend to 64
    560    bits. */
/* Fetch the byte at offset 'delta' and sign-extend it to 64 bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}
    565 
    566 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    567    bits. */
    568 static Long getSDisp16 ( Long delta )
    569 {
    570    UInt v = guest_code[delta+1]; v <<= 8;
    571    v |= guest_code[delta+0];
    572    return extend_s_16to64( (UShort)v );
    573 }
    574 
    575 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    576    bits. */
    577 static Long getSDisp32 ( Long delta )
    578 {
    579    UInt v = guest_code[delta+3]; v <<= 8;
    580    v |= guest_code[delta+2]; v <<= 8;
    581    v |= guest_code[delta+1]; v <<= 8;
    582    v |= guest_code[delta+0];
    583    return extend_s_32to64( v );
    584 }
    585 
    586 /* Get a 64-bit value out of the insn stream. */
    587 static Long getDisp64 ( Long delta )
    588 {
    589    ULong v = 0;
    590    v |= guest_code[delta+7]; v <<= 8;
    591    v |= guest_code[delta+6]; v <<= 8;
    592    v |= guest_code[delta+5]; v <<= 8;
    593    v |= guest_code[delta+4]; v <<= 8;
    594    v |= guest_code[delta+3]; v <<= 8;
    595    v |= guest_code[delta+2]; v <<= 8;
    596    v |= guest_code[delta+1]; v <<= 8;
    597    v |= guest_code[delta+0];
    598    return v;
    599 }
    600 
    601 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    602    if this is called with size==8.  Should not happen. */
    603 static Long getSDisp ( Int size, Long delta )
    604 {
    605    switch (size) {
    606       case 4: return getSDisp32(delta);
    607       case 2: return getSDisp16(delta);
    608       case 1: return getSDisp8(delta);
    609       default: vpanic("getSDisp(amd64)");
    610   }
    611 }
    612 
    613 static ULong mkSizeMask ( Int sz )
    614 {
    615    switch (sz) {
    616       case 1: return 0x00000000000000FFULL;
    617       case 2: return 0x000000000000FFFFULL;
    618       case 4: return 0x00000000FFFFFFFFULL;
    619       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    620       default: vpanic("mkSzMask(amd64)");
    621    }
    622 }
    623 
    624 static Int imin ( Int a, Int b )
    625 {
    626    return (a < b) ? a : b;
    627 }
    628 
    629 static IRType szToITy ( Int n )
    630 {
    631    switch (n) {
    632       case 1: return Ity_I8;
    633       case 2: return Ity_I16;
    634       case 4: return Ity_I32;
    635       case 8: return Ity_I64;
    636       default: vex_printf("\nszToITy(%d)\n", n);
    637                vpanic("szToITy(amd64)");
    638    }
    639 }
    640 
    641 
    642 /*------------------------------------------------------------*/
    643 /*--- For dealing with prefixes.                           ---*/
    644 /*------------------------------------------------------------*/
    645 
    646 /* The idea is to pass around an int holding a bitmask summarising
    647    info from the prefixes seen on the current instruction, including
    648    info from the REX byte.  This info is used in various places, but
    649    most especially when making sense of register fields in
    650    instructions.
    651 
    652    The top 16 bits of the prefix are 0x3141, just as a hacky way
    653    to ensure it really is a valid prefix.
    654 
    655    Things you can safely assume about a well-formed prefix:
    656    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    657    * if REX is not present then REXW,REXR,REXX,REXB will read
    658      as zero.
    659    * F2 and F3 will not both be 1.
    660 */
    661 
typedef UInt  Prefix;

#define PFX_ASO   (1<<0)     /* address-size override present (0x67) */
#define PFX_66    (1<<1)     /* operand-size override-to-16 present (0x66) */
#define PFX_REX   (1<<2)     /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW  (1<<3)     /* REX W bit, if REX present, else 0 */
#define PFX_REXR  (1<<4)     /* REX R bit, if REX present, else 0 */
#define PFX_REXX  (1<<5)     /* REX X bit, if REX present, else 0 */
#define PFX_REXB  (1<<6)     /* REX B bit, if REX present, else 0 */
#define PFX_LOCK  (1<<7)     /* bus LOCK prefix present (0xF0) */
#define PFX_F2    (1<<8)     /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3    (1<<9)     /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS    (1<<10)    /* CS segment prefix present (0x2E) */
#define PFX_DS    (1<<11)    /* DS segment prefix present (0x3E) */
#define PFX_ES    (1<<12)    /* ES segment prefix present (0x26) */
#define PFX_FS    (1<<13)    /* FS segment prefix present (0x64) */
#define PFX_GS    (1<<14)    /* GS segment prefix present (0x65) */
#define PFX_SS    (1<<15)    /* SS segment prefix present (0x36) */

/* Marker value placed in the top 16 bits of every well-formed
   Prefix, checked by IS_VALID_PFX.  NOTE: the original comments on
   PFX_F2/PFX_F3 had the mnemonics swapped; per the Intel/AMD manuals
   0xF2 is REPNE/REPNZ and 0xF3 is REP/REPE/REPZ (corrected above). */
#define PFX_EMPTY 0x31410000
    682 
/* Check the 0x3141 marker in the top 16 bits -- see PFX_EMPTY. */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFFFF0000) == PFX_EMPTY);
}

/* True iff a REX byte was present on the instruction. */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

/* Extract the REX W/X/B bits as 0 or 1. */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
/* Apparently unused.
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
*/
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}
    705 
/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
/* True iff the F2 (REPNE/REPNZ) prefix byte was seen. */
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
/* True iff the F3 (REP/REPE/REPZ) prefix byte was seen. */
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

/* True iff the 66 (operand-size override) prefix byte was seen. */
static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
/* True iff the 67 (address-size override) prefix byte was seen. */
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F2 set */
static Bool have66orF2 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F2)) > 0);
}

/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}
    785 
    786 
    787 /*------------------------------------------------------------*/
    788 /*--- For dealing with integer registers                   ---*/
    789 /*------------------------------------------------------------*/
    790 
    791 /* This is somewhat complex.  The rules are:
    792 
    793    For 64, 32 and 16 bit register references, the e or g fields in the
    794    modrm bytes supply the low 3 bits of the register number.  The
    795    fourth (most-significant) bit of the register number is supplied by
    796    the REX byte, if it is present; else that bit is taken to be zero.
    797 
    798    The REX.R bit supplies the high bit corresponding to the g register
    799    field, and the REX.B bit supplies the high bit corresponding to the
    800    e register field (when the mod part of modrm indicates that modrm's
    801    e component refers to a register and not to memory).
    802 
    803    The REX.X bit supplies a high register bit for certain registers
    804    in SIB address modes, and is generally rarely used.
    805 
    806    For 8 bit register references, the presence of the REX byte itself
    807    has significance.  If there is no REX present, then the 3-bit
    808    number extracted from the modrm e or g field is treated as an index
    809    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    810    old x86 encoding scheme.
    811 
    812    But if there is a REX present, the register reference is
    813    interpreted in the same way as for 64/32/16-bit references: a high
    814    bit is extracted from REX, giving a 4-bit number, and the denoted
    815    register is the lowest 8 bits of the 16 integer registers denoted
    816    by the number.  In particular, values 3 through 7 of this sequence
    817    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    818    %rsp %rbp %rsi %rdi.
    819 
    820    The REX.W bit has no bearing at all on register numbers.  Instead
    821    its presence indicates that the operand size is to be overridden
    822    from its default value (32 bits) to 64 bits instead.  This is in
    823    the same fashion that an 0x66 prefix indicates the operand size is
    824    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    825    0x66 are present there is a conflict, and REX.W takes precedence.
    826 
    827    Rather than try to handle this complexity using a single huge
    828    function, several smaller ones are provided.  The aim is to make it
    829    as difficult as possible to screw up register decoding in a subtle
    830    and hard-to-track-down way.
    831 
    832    Because these routines fish around in the host's memory (that is,
    833    in the guest state area) for sub-parts of guest registers, their
    834    correctness depends on the host's endianness.  So far these
    835    routines only work for little-endian hosts.  Those for which
    836    endianness is important have assertions to ensure sanity.
    837 */
    838 
    839 
    840 /* About the simplest question you can ask: where do the 64-bit
    841    integer registers live (in the guest state) ? */
    842 
static Int integerGuestReg64Offset ( UInt reg )
{
   /* Map an architectural register number (R_RAX .. R_R15) to the
      byte offset of that 64-bit register in the guest state.
      Panics on any value outside the 16 integer registers. */
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    865 
    866 
    867 /* Produce the name of an integer register, for printing purposes.
    868    reg is a number in the range 0 .. 15 that has been generated from a
    869    3-bit reg-field number and a REX extension bit.  irregular denotes
    870    the case where sz==1 and no REX byte is present. */
    871 
    872 static
    873 HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
    874 {
    875    static HChar* ireg64_names[16]
    876      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    877          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    878    static HChar* ireg32_names[16]
    879      = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
    880          "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
    881    static HChar* ireg16_names[16]
    882      = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
    883          "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
    884    static HChar* ireg8_names[16]
    885      = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
    886          "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
    887    static HChar* ireg8_irregular[8]
    888      = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };
    889 
    890    vassert(reg < 16);
    891    if (sz == 1) {
    892       if (irregular)
    893          vassert(reg < 8);
    894    } else {
    895       vassert(irregular == False);
    896    }
    897 
    898    switch (sz) {
    899       case 8: return ireg64_names[reg];
    900       case 4: return ireg32_names[reg];
    901       case 2: return ireg16_names[reg];
    902       case 1: if (irregular) {
    903                  return ireg8_irregular[reg];
    904               } else {
    905                  return ireg8_names[reg];
    906               }
    907       default: vpanic("nameIReg(amd64)");
    908    }
    909 }
    910 
    911 /* Using the same argument conventions as nameIReg, produce the
    912    guest state offset of an integer register. */
    913 
    914 static
    915 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
    916 {
    917    vassert(reg < 16);
    918    if (sz == 1) {
    919       if (irregular)
    920          vassert(reg < 8);
    921    } else {
    922       vassert(irregular == False);
    923    }
    924 
    925    /* Deal with irregular case -- sz==1 and no REX present */
    926    if (sz == 1 && irregular) {
    927       switch (reg) {
    928          case R_RSP: return 1+ OFFB_RAX;
    929          case R_RBP: return 1+ OFFB_RCX;
    930          case R_RSI: return 1+ OFFB_RDX;
    931          case R_RDI: return 1+ OFFB_RBX;
    932          default:    break; /* use the normal case */
    933       }
    934    }
    935 
    936    /* Normal case */
    937    return integerGuestReg64Offset(reg);
    938 }
    939 
    940 
    941 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    942 
    943 static IRExpr* getIRegCL ( void )
    944 {
    945    vassert(!host_is_bigendian);
    946    return IRExpr_Get( OFFB_RCX, Ity_I8 );
    947 }
    948 
    949 
    950 /* Write to the %AH register. */
    951 
    952 static void putIRegAH ( IRExpr* e )
    953 {
    954    vassert(!host_is_bigendian);
    955    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
    956    stmt( IRStmt_Put( OFFB_RAX+1, e ) );
    957 }
    958 
    959 
    960 /* Read/write various widths of %RAX, as it has various
    961    special-purpose uses. */
    962 
    963 static HChar* nameIRegRAX ( Int sz )
    964 {
    965    switch (sz) {
    966       case 1: return "%al";
    967       case 2: return "%ax";
    968       case 4: return "%eax";
    969       case 8: return "%rax";
    970       default: vpanic("nameIRegRAX(amd64)");
    971    }
    972 }
    973 
    974 static IRExpr* getIRegRAX ( Int sz )
    975 {
    976    vassert(!host_is_bigendian);
    977    switch (sz) {
    978       case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
    979       case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
    980       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
    981       case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
    982       default: vpanic("getIRegRAX(amd64)");
    983    }
    984 }
    985 
    986 static void putIRegRAX ( Int sz, IRExpr* e )
    987 {
    988    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    989    vassert(!host_is_bigendian);
    990    switch (sz) {
    991       case 8: vassert(ty == Ity_I64);
    992               stmt( IRStmt_Put( OFFB_RAX, e ));
    993               break;
    994       case 4: vassert(ty == Ity_I32);
    995               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
    996               break;
    997       case 2: vassert(ty == Ity_I16);
    998               stmt( IRStmt_Put( OFFB_RAX, e ));
    999               break;
   1000       case 1: vassert(ty == Ity_I8);
   1001               stmt( IRStmt_Put( OFFB_RAX, e ));
   1002               break;
   1003       default: vpanic("putIRegRAX(amd64)");
   1004    }
   1005 }
   1006 
   1007 
   1008 /* Read/write various widths of %RDX, as it has various
   1009    special-purpose uses. */
   1010 
   1011 static HChar* nameIRegRDX ( Int sz )
   1012 {
   1013    switch (sz) {
   1014       case 1: return "%dl";
   1015       case 2: return "%dx";
   1016       case 4: return "%edx";
   1017       case 8: return "%rdx";
   1018       default: vpanic("nameIRegRDX(amd64)");
   1019    }
   1020 }
   1021 
   1022 static IRExpr* getIRegRDX ( Int sz )
   1023 {
   1024    vassert(!host_is_bigendian);
   1025    switch (sz) {
   1026       case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
   1027       case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
   1028       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
   1029       case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
   1030       default: vpanic("getIRegRDX(amd64)");
   1031    }
   1032 }
   1033 
   1034 static void putIRegRDX ( Int sz, IRExpr* e )
   1035 {
   1036    vassert(!host_is_bigendian);
   1037    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1038    switch (sz) {
   1039       case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
   1040               break;
   1041       case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
   1042               break;
   1043       case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
   1044               break;
   1045       case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
   1046               break;
   1047       default: vpanic("putIRegRDX(amd64)");
   1048    }
   1049 }
   1050 
   1051 
   1052 /* Simplistic functions to deal with the integer registers as a
   1053    straightforward bank of 16 64-bit regs. */
   1054 
   1055 static IRExpr* getIReg64 ( UInt regno )
   1056 {
   1057    return IRExpr_Get( integerGuestReg64Offset(regno),
   1058                       Ity_I64 );
   1059 }
   1060 
   1061 static void putIReg64 ( UInt regno, IRExpr* e )
   1062 {
   1063    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1064    stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
   1065 }
   1066 
   1067 static HChar* nameIReg64 ( UInt regno )
   1068 {
   1069    return nameIReg( 8, regno, False );
   1070 }
   1071 
   1072 
   1073 /* Simplistic functions to deal with the lower halves of integer
   1074    registers as a straightforward bank of 16 32-bit regs. */
   1075 
   1076 static IRExpr* getIReg32 ( UInt regno )
   1077 {
   1078    vassert(!host_is_bigendian);
   1079    return unop(Iop_64to32,
   1080                IRExpr_Get( integerGuestReg64Offset(regno),
   1081                            Ity_I64 ));
   1082 }
   1083 
   1084 static void putIReg32 ( UInt regno, IRExpr* e )
   1085 {
   1086    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1087    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1088                      unop(Iop_32Uto64,e) ) );
   1089 }
   1090 
   1091 static HChar* nameIReg32 ( UInt regno )
   1092 {
   1093    return nameIReg( 4, regno, False );
   1094 }
   1095 
   1096 
   1097 /* Simplistic functions to deal with the lower quarters of integer
   1098    registers as a straightforward bank of 16 16-bit regs. */
   1099 
   1100 static IRExpr* getIReg16 ( UInt regno )
   1101 {
   1102    vassert(!host_is_bigendian);
   1103    return IRExpr_Get( integerGuestReg64Offset(regno),
   1104                       Ity_I16 );
   1105 }
   1106 
   1107 static HChar* nameIReg16 ( UInt regno )
   1108 {
   1109    return nameIReg( 2, regno, False );
   1110 }
   1111 
   1112 
   1113 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1114    which field of the REX byte is to be used to extend to a 4-bit
   1115    number.  These functions cater for that situation.
   1116 */
   1117 static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
   1118 {
   1119    vassert(lo3bits < 8);
   1120    vassert(IS_VALID_PFX(pfx));
   1121    return getIReg64( lo3bits | (getRexX(pfx) << 3) );
   1122 }
   1123 
   1124 static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
   1125 {
   1126    vassert(lo3bits < 8);
   1127    vassert(IS_VALID_PFX(pfx));
   1128    return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
   1129 }
   1130 
   1131 static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1132 {
   1133    vassert(lo3bits < 8);
   1134    vassert(IS_VALID_PFX(pfx));
   1135    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1136    return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1137                         toBool(sz==1 && !haveREX(pfx)) );
   1138 }
   1139 
   1140 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1141 {
   1142    vassert(lo3bits < 8);
   1143    vassert(IS_VALID_PFX(pfx));
   1144    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1145    if (sz == 4) {
   1146       sz = 8;
   1147       return unop(Iop_64to32,
   1148                   IRExpr_Get(
   1149                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1150                                      toBool(sz==1 && !haveREX(pfx)) ),
   1151                      szToITy(sz)
   1152                  )
   1153              );
   1154    } else {
   1155       return IRExpr_Get(
   1156                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1157                                 toBool(sz==1 && !haveREX(pfx)) ),
   1158                 szToITy(sz)
   1159              );
   1160    }
   1161 }
   1162 
   1163 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1164 {
   1165    vassert(lo3bits < 8);
   1166    vassert(IS_VALID_PFX(pfx));
   1167    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1168    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1169    stmt( IRStmt_Put(
   1170             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1171                             toBool(sz==1 && !haveREX(pfx)) ),
   1172             sz==4 ? unop(Iop_32Uto64,e) : e
   1173    ));
   1174 }
   1175 
   1176 
   1177 /* Functions for getting register numbers from modrm bytes and REX
   1178    when we don't have to consider the complexities of integer subreg
   1179    accesses.
   1180 */
   1181 /* Extract the g reg field from a modRM byte, and augment it using the
   1182    REX.R bit from the supplied REX byte.  The R bit usually is
   1183    associated with the g register field.
   1184 */
   1185 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1186 {
   1187    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1188    reg += (pfx & PFX_REXR) ? 8 : 0;
   1189    return reg;
   1190 }
   1191 
   1192 /* Extract the e reg field from a modRM byte, and augment it using the
   1193    REX.B bit from the supplied REX byte.  The B bit usually is
   1194    associated with the e register field (when modrm indicates e is a
   1195    register, that is).
   1196 */
   1197 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1198 {
   1199    Int rm;
   1200    vassert(epartIsReg(mod_reg_rm));
   1201    rm = (Int)(mod_reg_rm & 0x7);
   1202    rm += (pfx & PFX_REXB) ? 8 : 0;
   1203    return rm;
   1204 }
   1205 
   1206 
   1207 /* General functions for dealing with integer register access. */
   1208 
   1209 /* Produce the guest state offset for a reference to the 'g' register
   1210    field in a modrm byte, taking into account REX (or its absence),
   1211    and the size of the access.
   1212 */
   1213 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1214 {
   1215    UInt reg;
   1216    vassert(!host_is_bigendian);
   1217    vassert(IS_VALID_PFX(pfx));
   1218    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1219    reg = gregOfRexRM( pfx, mod_reg_rm );
   1220    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1221 }
   1222 
   1223 static
   1224 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1225 {
   1226    if (sz == 4) {
   1227       sz = 8;
   1228       return unop(Iop_64to32,
   1229                   IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1230                               szToITy(sz) ));
   1231    } else {
   1232       return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1233                          szToITy(sz) );
   1234    }
   1235 }
   1236 
   1237 static
   1238 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1239 {
   1240    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1241    if (sz == 4) {
   1242       e = unop(Iop_32Uto64,e);
   1243    }
   1244    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1245 }
   1246 
   1247 static
   1248 HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1249 {
   1250    return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
   1251                         toBool(sz==1 && !haveREX(pfx)) );
   1252 }
   1253 
   1254 
   1255 /* Produce the guest state offset for a reference to the 'e' register
   1256    field in a modrm byte, taking into account REX (or its absence),
   1257    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1258    denotes a memory access rather than a register access.
   1259 */
   1260 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1261 {
   1262    UInt reg;
   1263    vassert(!host_is_bigendian);
   1264    vassert(IS_VALID_PFX(pfx));
   1265    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1266    reg = eregOfRexRM( pfx, mod_reg_rm );
   1267    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1268 }
   1269 
   1270 static
   1271 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1272 {
   1273    if (sz == 4) {
   1274       sz = 8;
   1275       return unop(Iop_64to32,
   1276                   IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1277                               szToITy(sz) ));
   1278    } else {
   1279       return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1280                          szToITy(sz) );
   1281    }
   1282 }
   1283 
   1284 static
   1285 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1286 {
   1287    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1288    if (sz == 4) {
   1289       e = unop(Iop_32Uto64,e);
   1290    }
   1291    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1292 }
   1293 
   1294 static
   1295 HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1296 {
   1297    return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
   1298                         toBool(sz==1 && !haveREX(pfx)) );
   1299 }
   1300 
   1301 
   1302 /*------------------------------------------------------------*/
   1303 /*--- For dealing with XMM registers                       ---*/
   1304 /*------------------------------------------------------------*/
   1305 
   1306 //.. static Int segmentGuestRegOffset ( UInt sreg )
   1307 //.. {
   1308 //..    switch (sreg) {
   1309 //..       case R_ES: return OFFB_ES;
   1310 //..       case R_CS: return OFFB_CS;
   1311 //..       case R_SS: return OFFB_SS;
   1312 //..       case R_DS: return OFFB_DS;
   1313 //..       case R_FS: return OFFB_FS;
   1314 //..       case R_GS: return OFFB_GS;
   1315 //..       default: vpanic("segmentGuestRegOffset(x86)");
   1316 //..    }
   1317 //.. }
   1318 
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Map an XMM register number (0 .. 15) to the byte offset of that
      128-bit register in the guest state.  Panics otherwise. */
   switch (xmmreg) {
      case 0:  return OFFB_XMM0;
      case 1:  return OFFB_XMM1;
      case 2:  return OFFB_XMM2;
      case 3:  return OFFB_XMM3;
      case 4:  return OFFB_XMM4;
      case 5:  return OFFB_XMM5;
      case 6:  return OFFB_XMM6;
      case 7:  return OFFB_XMM7;
      case 8:  return OFFB_XMM8;
      case 9:  return OFFB_XMM9;
      case 10: return OFFB_XMM10;
      case 11: return OFFB_XMM11;
      case 12: return OFFB_XMM12;
      case 13: return OFFB_XMM13;
      case 14: return OFFB_XMM14;
      case 15: return OFFB_XMM15;
      default: vpanic("xmmGuestRegOffset(amd64)");
   }
}
   1341 
   1342 /* Lanes of vector registers are always numbered from zero being the
   1343    least significant lane (rightmost in the register).  */
   1344 
   1345 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
   1346 {
   1347    /* Correct for little-endian host only. */
   1348    vassert(!host_is_bigendian);
   1349    vassert(laneno >= 0 && laneno < 8);
   1350    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
   1351 }
   1352 
   1353 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
   1354 {
   1355    /* Correct for little-endian host only. */
   1356    vassert(!host_is_bigendian);
   1357    vassert(laneno >= 0 && laneno < 4);
   1358    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
   1359 }
   1360 
   1361 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
   1362 {
   1363    /* Correct for little-endian host only. */
   1364    vassert(!host_is_bigendian);
   1365    vassert(laneno >= 0 && laneno < 2);
   1366    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
   1367 }
   1368 
   1369 //.. static IRExpr* getSReg ( UInt sreg )
   1370 //.. {
   1371 //..    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
   1372 //.. }
   1373 //..
   1374 //.. static void putSReg ( UInt sreg, IRExpr* e )
   1375 //.. {
   1376 //..    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1377 //..    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
   1378 //.. }
   1379 
   1380 static IRExpr* getXMMReg ( UInt xmmreg )
   1381 {
   1382    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
   1383 }
   1384 
   1385 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
   1386 {
   1387    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
   1388 }
   1389 
   1390 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
   1391 {
   1392    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
   1393 }
   1394 
   1395 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
   1396 {
   1397    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
   1398 }
   1399 
   1400 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
   1401 {
   1402    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
   1403 }
   1404 
   1405 static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
   1406 {
   1407   return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
   1408 }
   1409 
   1410 static void putXMMReg ( UInt xmmreg, IRExpr* e )
   1411 {
   1412    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1413    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
   1414 }
   1415 
   1416 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
   1417 {
   1418    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1419    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1420 }
   1421 
   1422 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
   1423 {
   1424    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1425    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1426 }
   1427 
   1428 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
   1429 {
   1430    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1431    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1432 }
   1433 
   1434 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
   1435 {
   1436    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1437    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1438 }
   1439 
   1440 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
   1441 {
   1442    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1443    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
   1444 }
   1445 
   1446 static IRExpr* mkV128 ( UShort mask )
   1447 {
   1448    return IRExpr_Const(IRConst_V128(mask));
   1449 }
   1450 
   1451 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1452 {
   1453    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1454    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1455    return unop(Iop_64to1,
   1456                binop(Iop_And64,
   1457                      unop(Iop_1Uto64,x),
   1458                      unop(Iop_1Uto64,y)));
   1459 }
   1460 
   1461 /* Generate a compare-and-swap operation, operating on memory at
   1462    'addr'.  The expected value is 'expVal' and the new value is
   1463    'newVal'.  If the operation fails, then transfer control (with a
   1464    no-redir jump (XXX no -- see comment at top of this file)) to
   1465    'restart_point', which is presumably the address of the guest
   1466    instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same scalar integer type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single-element (non-double) little-endian CAS.  oldTmp receives
      whatever value was found in memory at 'addr'. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the observed old value differs from the expected one, the CAS
      failed: branch back to restart_point so the guest instruction is
      re-executed (retried). */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point )
         ));
}
   1489 
   1490 
   1491 /*------------------------------------------------------------*/
   1492 /*--- Helpers for %rflags.                                 ---*/
   1493 /*------------------------------------------------------------*/
   1494 
   1495 /* -------------- Evaluating the flags-thunk. -------------- */
   1496 
   1497 /* Build IR to calculate all the eflags from stored
   1498    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1499    Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   /* Pass the four flag-thunk fields to the clean helper, which
      recomputes the complete %rflags value from them. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1519 
   1520 /* Build IR to calculate some particular condition from stored
   1521    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1522    Ity_Bit. */
static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
{
   /* Pass the condition code plus the four flag-thunk fields to the
      clean helper; its non-zero/zero result becomes an Ity_I1. */
   IRExpr** args
      = mkIRExprVec_5( mkU64(cond),
                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_condition", &amd64g_calculate_condition,
           args
        );
   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   return unop(Iop_64to1, call);
}
   1543 
   1544 /* Build IR to calculate just the carry flag from stored
   1545    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_c ( void )
{
   /* Pass the four flag-thunk fields to the clean helper, which
      recomputes just the carry flag from them. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1565 
   1566 
   1567 /* -------------- Building the flags-thunk. -------------- */
   1568 
   1569 /* The machinery in this section builds the flag-thunk following a
   1570    flag-setting operation.  Hence the various setFlags_* functions.
   1571 */
   1572 
   1573 static Bool isAddSub ( IROp op8 )
   1574 {
   1575    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1576 }
   1577 
   1578 static Bool isLogic ( IROp op8 )
   1579 {
   1580    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1581 }
   1582 
   1583 /* U-widen 8/16/32/64 bit int expr to 64. */
   1584 static IRExpr* widenUto64 ( IRExpr* e )
   1585 {
   1586    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1587       case Ity_I64: return e;
   1588       case Ity_I32: return unop(Iop_32Uto64, e);
   1589       case Ity_I16: return unop(Iop_16Uto64, e);
   1590       case Ity_I8:  return unop(Iop_8Uto64, e);
   1591       default: vpanic("widenUto64");
   1592    }
   1593 }
   1594 
   1595 /* S-widen 8/16/32/64 bit int expr to 32. */
   1596 static IRExpr* widenSto64 ( IRExpr* e )
   1597 {
   1598    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1599       case Ity_I64: return e;
   1600       case Ity_I32: return unop(Iop_32Sto64, e);
   1601       case Ity_I16: return unop(Iop_16Sto64, e);
   1602       case Ity_I8:  return unop(Iop_8Sto64, e);
   1603       default: vpanic("widenSto64");
   1604    }
   1605 }
   1606 
   1607 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1608    of these combinations make sense. */
   1609 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1610 {
   1611    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1612    if (src_ty == dst_ty)
   1613       return e;
   1614    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1615       return unop(Iop_32to16, e);
   1616    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1617       return unop(Iop_32to8, e);
   1618    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1619       return unop(Iop_64to32, e);
   1620    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1621       return unop(Iop_64to16, e);
   1622    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1623       return unop(Iop_64to8, e);
   1624 
   1625    vex_printf("\nsrc, dst tys are: ");
   1626    ppIRType(src_ty);
   1627    vex_printf(", ");
   1628    ppIRType(dst_ty);
   1629    vex_printf("\n");
   1630    vpanic("narrowTo(amd64)");
   1631 }
   1632 
   1633 
   1634 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1635    auto-sized up to the real op. */
   1636 
   1637 static
   1638 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1639 {
   1640    Int ccOp = 0;
   1641    switch (ty) {
   1642       case Ity_I8:  ccOp = 0; break;
   1643       case Ity_I16: ccOp = 1; break;
   1644       case Ity_I32: ccOp = 2; break;
   1645       case Ity_I64: ccOp = 3; break;
   1646       default: vassert(0);
   1647    }
   1648    switch (op8) {
   1649       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1650       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1651       default:       ppIROp(op8);
   1652                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1653    }
   1654    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1655    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1656    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1657 }
   1658 
   1659 
   1660 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1661 
   1662 static
   1663 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1664 {
   1665    Int ccOp = 0;
   1666    switch (ty) {
   1667       case Ity_I8:  ccOp = 0; break;
   1668       case Ity_I16: ccOp = 1; break;
   1669       case Ity_I32: ccOp = 2; break;
   1670       case Ity_I64: ccOp = 3; break;
   1671       default: vassert(0);
   1672    }
   1673    switch (op8) {
   1674       case Iop_Or8:
   1675       case Iop_And8:
   1676       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1677       default:       ppIROp(op8);
   1678                      vpanic("setFlags_DEP1(amd64)");
   1679    }
   1680    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1681    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1682    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1683 }
   1684 
   1685 
   1686 /* For shift operations, we put in the result and the undershifted
   1687    result.  Except if the shift amount is zero, the thunk is left
   1688    unchanged. */
   1689 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   Int ccOp = 0;
   /* Width offset 0..3 selects the B/W/L/Q variant of the thunk op. */
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each Put is guarded by a Mux0X on 'guard': when the guard is
      zero (shift amount was zero) the old thunk field is written back
      unchanged, so the flags state is preserved. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
   1731 
   1732 
   1733 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1734    the former value of the carry flag, which unfortunately we have to
   1735    compute. */
   1736 
static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Base op selects INC vs DEC; the width then adds 0..3 to pick the
      B/W/L/Q variant. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
   1756 
   1757 
   1758 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1759    two arguments. */
   1760 
   1761 static
   1762 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1763 {
   1764    switch (ty) {
   1765       case Ity_I8:
   1766          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1767          break;
   1768       case Ity_I16:
   1769          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1770          break;
   1771       case Ity_I32:
   1772          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1773          break;
   1774       case Ity_I64:
   1775          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1776          break;
   1777       default:
   1778          vpanic("setFlags_MUL(amd64)");
   1779    }
   1780    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1781    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1782 }
   1783 
   1784 
   1785 /* -------------- Condition codes. -------------- */
   1786 
   1787 /* Condition codes, using the AMD encoding.  */
   1788 
/* Return the disassembly-printing mnemonic for 'cond'.  Where the AMD
   encoding gives a condition two names, the preferred mnemonic is
   returned and the alternative is noted alongside. */
static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1812 
   1813 static
   1814 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1815                                           /*OUT*/Bool*   needInvert )
   1816 {
   1817    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1818    if (cond & 1) {
   1819       *needInvert = True;
   1820       return cond-1;
   1821    } else {
   1822       *needInvert = False;
   1823       return cond;
   1824    }
   1825 }
   1826 
   1827 
   1828 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1829 
   1830 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1831    appropriately.
   1832 
   1833    Optionally, generate a store for the 'tres' value.  This can either
   1834    be a normal store, or it can be a cas-with-possible-failure style
   1835    store:
   1836 
   1837    if taddr is IRTemp_INVALID, then no store is generated.
   1838 
   1839    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1840    the address) is generated:
   1841 
   1842      if texpVal is IRTemp_INVALID then a normal store is
   1843      generated, and restart_point must be zero (it is irrelevant).
   1844 
   1845      if texpVal is not IRTemp_INVALID then a cas-style store is
   1846      generated.  texpVal is the expected value, restart_point
   1847      is the restart point if the store fails, and texpVal must
   1848      have the same type as tres.
   1849 
   1850 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);
   IRTemp  oldcn = newTemp(ty);          /* carry, narrowed to 'ty' */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the ADC thunk op matching the operand width. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk fields: DEP1 = first operand, DEP2 = second operand XORed
      with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   1904 
   1905 
   1906 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1907    appropriately.  As with helper_ADC, possibly generate a store of
   1908    the result -- see comments on helper_ADC for details.
   1909 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);
   IRTemp  oldcn = newTemp(ty);          /* borrow, narrowed to 'ty' */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the SBB thunk op matching the operand width. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk fields: DEP1 = first operand, DEP2 = second operand XORed
      with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   1963 
   1964 
   1965 /* -------------- Helpers for disassembly printing. -------------- */
   1966 
   1967 static HChar* nameGrp1 ( Int opc_aux )
   1968 {
   1969    static HChar* grp1_names[8]
   1970      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1971    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   1972    return grp1_names[opc_aux];
   1973 }
   1974 
   1975 static HChar* nameGrp2 ( Int opc_aux )
   1976 {
   1977    static HChar* grp2_names[8]
   1978      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1979    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   1980    return grp2_names[opc_aux];
   1981 }
   1982 
   1983 static HChar* nameGrp4 ( Int opc_aux )
   1984 {
   1985    static HChar* grp4_names[8]
   1986      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1987    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   1988    return grp4_names[opc_aux];
   1989 }
   1990 
   1991 static HChar* nameGrp5 ( Int opc_aux )
   1992 {
   1993    static HChar* grp5_names[8]
   1994      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1995    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   1996    return grp5_names[opc_aux];
   1997 }
   1998 
   1999 static HChar* nameGrp8 ( Int opc_aux )
   2000 {
   2001    static HChar* grp8_names[8]
   2002       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2003    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2004    return grp8_names[opc_aux];
   2005 }
   2006 
   2007 //.. static HChar* nameSReg ( UInt sreg )
   2008 //.. {
   2009 //..    switch (sreg) {
   2010 //..       case R_ES: return "%es";
   2011 //..       case R_CS: return "%cs";
   2012 //..       case R_SS: return "%ss";
   2013 //..       case R_DS: return "%ds";
   2014 //..       case R_FS: return "%fs";
   2015 //..       case R_GS: return "%gs";
   2016 //..       default: vpanic("nameSReg(x86)");
   2017 //..    }
   2018 //.. }
   2019 
   2020 static HChar* nameMMXReg ( Int mmxreg )
   2021 {
   2022    static HChar* mmx_names[8]
   2023      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2024    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2025    return mmx_names[mmxreg];
   2026 }
   2027 
   2028 static HChar* nameXMMReg ( Int xmmreg )
   2029 {
   2030    static HChar* xmm_names[16]
   2031      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2032          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2033          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2034          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2035    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2036    return xmm_names[xmmreg];
   2037 }
   2038 
   2039 static HChar* nameMMXGran ( Int gran )
   2040 {
   2041    switch (gran) {
   2042       case 0: return "b";
   2043       case 1: return "w";
   2044       case 2: return "d";
   2045       case 3: return "q";
   2046       default: vpanic("nameMMXGran(amd64,guest)");
   2047    }
   2048 }
   2049 
   2050 static HChar nameISize ( Int size )
   2051 {
   2052    switch (size) {
   2053       case 8: return 'q';
   2054       case 4: return 'l';
   2055       case 2: return 'w';
   2056       case 1: return 'b';
   2057       default: vpanic("nameISize(amd64)");
   2058    }
   2059 }
   2060 
   2061 
   2062 /*------------------------------------------------------------*/
   2063 /*--- JMP helpers                                          ---*/
   2064 /*------------------------------------------------------------*/
   2065 
/* Terminate the block-in-construction with an unconditional jump of
   kind 'kind' to the literal guest address 'd64'. */
static void jmp_lit( IRJumpKind kind, Addr64 d64 )
{
   irsb->next     = mkU64(d64);
   irsb->jumpkind = kind;
}
   2071 
/* Terminate the block-in-construction with an unconditional jump of
   kind 'kind' to the address held in temporary 't'. */
static void jmp_treg( IRJumpKind kind, IRTemp t )
{
   irsb->next     = mkexpr(t);
   irsb->jumpkind = kind;
}
   2077 
   2078 static
   2079 void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
   2080 {
   2081    Bool          invert;
   2082    AMD64Condcode condPos;
   2083    condPos = positiveIse_AMD64Condcode ( cond, &invert );
   2084    if (invert) {
   2085       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2086                          Ijk_Boring,
   2087                          IRConst_U64(d64_false) ) );
   2088       irsb->next     = mkU64(d64_true);
   2089       irsb->jumpkind = Ijk_Boring;
   2090    } else {
   2091       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2092                          Ijk_Boring,
   2093                          IRConst_U64(d64_true) ) );
   2094       irsb->next     = mkU64(d64_false);
   2095       irsb->jumpkind = Ijk_Boring;
   2096    }
   2097 }
   2098 
   2099 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2100    guest address of the next instruction to be executed.
   2101 
   2102    This function generates an AbiHint to say that -128(%rsp)
   2103    .. -1(%rsp) should now be regarded as uninitialised.
   2104 */
static
void make_redzone_AbiHint ( VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, HChar* who )
{
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   /* Debug aid, normally disabled: announce who requested the hint. */
   if (0) vex_printf("AbiHint: %s\n", who);
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   /* Mark [new_rsp - szB, new_rsp) -- the red zone -- as undefined. */
   if (szB > 0)
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2127 
   2128 
   2129 /*------------------------------------------------------------*/
   2130 /*--- Disassembling addressing modes                       ---*/
   2131 /*------------------------------------------------------------*/
   2132 
   2133 static
   2134 HChar* segRegTxt ( Prefix pfx )
   2135 {
   2136    if (pfx & PFX_CS) return "%cs:";
   2137    if (pfx & PFX_DS) return "%ds:";
   2138    if (pfx & PFX_ES) return "%es:";
   2139    if (pfx & PFX_FS) return "%fs:";
   2140    if (pfx & PFX_GS) return "%gs:";
   2141    if (pfx & PFX_SS) return "%ss:";
   2142    return ""; /* no override */
   2143 }
   2144 
   2145 
   2146 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2147    linear address by adding any required segment override as indicated
   2148    by sorb, and also dealing with any address size override
   2149    present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   /* Truncate the computed address to 32 bits and zero-extend it back
      to 64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2187 
   2188 //.. {
   2189 //..    Int    sreg;
   2190 //..    IRType hWordTy;
   2191 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2192 //..
   2193 //..    if (sorb == 0)
   2194 //..       /* the common case - no override */
   2195 //..       return virtual;
   2196 //..
   2197 //..    switch (sorb) {
   2198 //..       case 0x3E: sreg = R_DS; break;
   2199 //..       case 0x26: sreg = R_ES; break;
   2200 //..       case 0x64: sreg = R_FS; break;
   2201 //..       case 0x65: sreg = R_GS; break;
   2202 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2203 //..    }
   2204 //..
   2205 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2206 //..
   2207 //..    seg_selector = newTemp(Ity_I32);
   2208 //..    ldt_ptr      = newTemp(hWordTy);
   2209 //..    gdt_ptr      = newTemp(hWordTy);
   2210 //..    r64          = newTemp(Ity_I64);
   2211 //..
   2212 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2213 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2214 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2215 //..
   2216 //..    /*
   2217 //..    Call this to do the translation and limit checks:
   2218 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2219 //..                                  UInt seg_selector, UInt virtual_addr )
   2220 //..    */
   2221 //..    assign(
   2222 //..       r64,
   2223 //..       mkIRExprCCall(
   2224 //..          Ity_I64,
   2225 //..          0/*regparms*/,
   2226 //..          "x86g_use_seg_selector",
   2227 //..          &x86g_use_seg_selector,
   2228 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2229 //..                         mkexpr(seg_selector), virtual)
   2230 //..       )
   2231 //..    );
   2232 //..
   2233 //..    /* If the high 32 of the result are non-zero, there was a
   2234 //..       failure in address translation.  In which case, make a
   2235 //..       quick exit.
   2236 //..    */
   2237 //..    stmt(
   2238 //..       IRStmt_Exit(
   2239 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2240 //..          Ijk_MapFail,
   2241 //..          IRConst_U32( guest_eip_curr_instr )
   2242 //..       )
   2243 //..    );
   2244 //..
   2245 //..    /* otherwise, here's the translated result. */
   2246 //..    return unop(Iop_64to32, mkexpr(r64));
   2247 //.. }
   2248 
   2249 
   2250 /* Generate IR to calculate an address indicated by a ModRM and
   2251    following SIB bytes.  The expression, and the number of bytes in
   2252    the address mode, are returned (the latter in *len).  Note that
   2253    this fn should not be called if the R/M part of the address denotes
   2254    a register instead of memory.  If print_codegen is true, text of
   2255    the addressing mode is placed in buf.
   2256 
   2257    The computed address is stored in a new tempreg, and the
   2258    identity of the tempreg is returned.
   2259 
   2260    extra_bytes holds the number of bytes after the amode, as supplied
   2261    by the caller.  This is needed to make sense of %rip-relative
   2262    addresses.  Note that the value that *len is set to is only the
   2263    length of the amode itself and does not include the value supplied
   2264    in extra_bytes.
   2265  */
   2266 
/* Park the 64-bit address expression 'addr64' in a fresh I64 temporary
   and return that temporary. */
static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
{
   IRTemp tmp = newTemp(Ity_I64);
   assign( tmp, addr64 );
   return tmp;
}
   2273 
   2274 static
   2275 IRTemp disAMode ( /*OUT*/Int* len,
   2276                   VexAbiInfo* vbi, Prefix pfx, Long delta,
   2277                   /*OUT*/HChar* buf, Int extra_bytes )
   2278 {
   2279    UChar mod_reg_rm = getUChar(delta);
   2280    delta++;
   2281 
   2282    buf[0] = (UChar)0;
   2283    vassert(extra_bytes >= 0 && extra_bytes < 10);
   2284 
   2285    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2286       jump table seems a bit excessive.
   2287    */
   2288    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2289    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2290                                                /* is now XX0XXYYY */
   2291    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2292    switch (mod_reg_rm) {
   2293 
   2294       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2295          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2296       */
   2297       case 0x00: case 0x01: case 0x02: case 0x03:
   2298       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2299          { UChar rm = toUChar(mod_reg_rm & 7);
   2300            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2301            *len = 1;
   2302            return disAMode_copy2tmp(
   2303                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
   2304          }
   2305 
   2306       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2307          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2308       */
   2309       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2310       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2311          { UChar rm = toUChar(mod_reg_rm & 7);
   2312            Long d   = getSDisp8(delta);
   2313            if (d == 0) {
   2314               DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2315            } else {
   2316               DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2317            }
   2318            *len = 2;
   2319            return disAMode_copy2tmp(
   2320                   handleAddrOverrides(vbi, pfx,
   2321                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2322          }
   2323 
   2324       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2325          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2326       */
   2327       case 0x10: case 0x11: case 0x12: case 0x13:
   2328       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2329          { UChar rm = toUChar(mod_reg_rm & 7);
   2330            Long  d  = getSDisp32(delta);
   2331            DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2332            *len = 5;
   2333            return disAMode_copy2tmp(
   2334                   handleAddrOverrides(vbi, pfx,
   2335                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2336          }
   2337 
   2338       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2340       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2341       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2342          vpanic("disAMode(amd64): not an addr!");
   2343 
   2344       /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
   2345          correctly at the start of handling each instruction. */
   2346       case 0x05:
   2347          { Long d = getSDisp32(delta);
   2348            *len = 5;
   2349            DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
   2350            /* We need to know the next instruction's start address.
   2351               Try and figure out what it is, record the guess, and ask
   2352               the top-level driver logic (bbToIR_AMD64) to check we
   2353               guessed right, after the instruction is completely
   2354               decoded. */
   2355            guest_RIP_next_mustcheck = True;
   2356            guest_RIP_next_assumed = guest_RIP_bbstart
   2357                                     + delta+4 + extra_bytes;
   2358            return disAMode_copy2tmp(
   2359                      handleAddrOverrides(vbi, pfx,
   2360                         binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
   2361                                          mkU64(d))));
   2362          }
   2363 
   2364       case 0x04: {
   2365          /* SIB, with no displacement.  Special cases:
   2366             -- %rsp cannot act as an index value.
   2367                If index_r indicates %rsp, zero is used for the index.
   2368             -- when mod is zero and base indicates RBP or R13, base is
   2369                instead a 32-bit sign-extended literal.
   2370             It's all madness, I tell you.  Extract %index, %base and
   2371             scale from the SIB byte.  The value denoted is then:
   2372                | %index == %RSP && (%base == %RBP || %base == %R13)
   2373                = d32 following SIB byte
   2374                | %index == %RSP && !(%base == %RBP || %base == %R13)
   2375                = %base
   2376                | %index != %RSP && (%base == %RBP || %base == %R13)
   2377                = d32 following SIB byte + (%index << scale)
   2378                | %index != %RSP && !(%base == %RBP || %base == %R13)
   2379                = %base + (%index << scale)
   2380          */
   2381          UChar sib     = getUChar(delta);
   2382          UChar scale   = toUChar((sib >> 6) & 3);
   2383          UChar index_r = toUChar((sib >> 3) & 7);
   2384          UChar base_r  = toUChar(sib & 7);
   2385          /* correct since #(R13) == 8 + #(RBP) */
   2386          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2387          Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
   2388          delta++;
   2389 
   2390          if ((!index_is_SP) && (!base_is_BPor13)) {
   2391             if (scale == 0) {
   2392                DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2393                          nameIRegRexB(8,pfx,base_r),
   2394                          nameIReg64rexX(pfx,index_r));
   2395             } else {
   2396                DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2397                          nameIRegRexB(8,pfx,base_r),
   2398                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2399             }
   2400             *len = 2;
   2401             return
   2402                disAMode_copy2tmp(
   2403                handleAddrOverrides(vbi, pfx,
   2404                   binop(Iop_Add64,
   2405                         getIRegRexB(8,pfx,base_r),
   2406                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2407                               mkU8(scale)))));
   2408          }
   2409 
   2410          if ((!index_is_SP) && base_is_BPor13) {
   2411             Long d = getSDisp32(delta);
   2412             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
   2413                       nameIReg64rexX(pfx,index_r), 1<<scale);
   2414             *len = 6;
   2415             return
   2416                disAMode_copy2tmp(
   2417                handleAddrOverrides(vbi, pfx,
   2418                   binop(Iop_Add64,
   2419                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2420                                          mkU8(scale)),
   2421                         mkU64(d))));
   2422          }
   2423 
   2424          if (index_is_SP && (!base_is_BPor13)) {
   2425             DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
   2426             *len = 2;
   2427             return disAMode_copy2tmp(
   2428                    handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
   2429          }
   2430 
   2431          if (index_is_SP && base_is_BPor13) {
   2432             Long d = getSDisp32(delta);
   2433             DIS(buf, "%s%lld", segRegTxt(pfx), d);
   2434             *len = 6;
   2435             return disAMode_copy2tmp(
   2436                    handleAddrOverrides(vbi, pfx, mkU64(d)));
   2437          }
   2438 
   2439          vassert(0);
   2440       }
   2441 
   2442       /* SIB, with 8-bit displacement.  Special cases:
   2443          -- %esp cannot act as an index value.
   2444             If index_r indicates %esp, zero is used for the index.
   2445          Denoted value is:
   2446             | %index == %ESP
   2447             = d8 + %base
   2448             | %index != %ESP
   2449             = d8 + %base + (%index << scale)
   2450       */
   2451       case 0x0C: {
   2452          UChar sib     = getUChar(delta);
   2453          UChar scale   = toUChar((sib >> 6) & 3);
   2454          UChar index_r = toUChar((sib >> 3) & 7);
   2455          UChar base_r  = toUChar(sib & 7);
   2456          Long d        = getSDisp8(delta+1);
   2457 
   2458          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2459             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2460                                    d, nameIRegRexB(8,pfx,base_r));
   2461             *len = 3;
   2462             return disAMode_copy2tmp(
   2463                    handleAddrOverrides(vbi, pfx,
   2464                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2465          } else {
   2466             if (scale == 0) {
   2467                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2468                          nameIRegRexB(8,pfx,base_r),
   2469                          nameIReg64rexX(pfx,index_r));
   2470             } else {
   2471                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2472                          nameIRegRexB(8,pfx,base_r),
   2473                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2474             }
   2475             *len = 3;
   2476             return
   2477                 disAMode_copy2tmp(
   2478                 handleAddrOverrides(vbi, pfx,
   2479                   binop(Iop_Add64,
   2480                         binop(Iop_Add64,
   2481                               getIRegRexB(8,pfx,base_r),
   2482                               binop(Iop_Shl64,
   2483                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2484                         mkU64(d))));
   2485          }
   2486          vassert(0); /*NOTREACHED*/
   2487       }
   2488 
   2489       /* SIB, with 32-bit displacement.  Special cases:
   2490          -- %rsp cannot act as an index value.
   2491             If index_r indicates %rsp, zero is used for the index.
   2492          Denoted value is:
   2493             | %index == %RSP
   2494             = d32 + %base
   2495             | %index != %RSP
   2496             = d32 + %base + (%index << scale)
   2497       */
   2498       case 0x14: {
   2499          UChar sib     = getUChar(delta);
   2500          UChar scale   = toUChar((sib >> 6) & 3);
   2501          UChar index_r = toUChar((sib >> 3) & 7);
   2502          UChar base_r  = toUChar(sib & 7);
   2503          Long d        = getSDisp32(delta+1);
   2504 
   2505          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2506             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2507                                    d, nameIRegRexB(8,pfx,base_r));
   2508             *len = 6;
   2509             return disAMode_copy2tmp(
   2510                    handleAddrOverrides(vbi, pfx,
   2511                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2512          } else {
   2513             if (scale == 0) {
   2514                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2515                          nameIRegRexB(8,pfx,base_r),
   2516                          nameIReg64rexX(pfx,index_r));
   2517             } else {
   2518                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2519                          nameIRegRexB(8,pfx,base_r),
   2520                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2521             }
   2522             *len = 6;
   2523             return
   2524                 disAMode_copy2tmp(
   2525                 handleAddrOverrides(vbi, pfx,
   2526                   binop(Iop_Add64,
   2527                         binop(Iop_Add64,
   2528                               getIRegRexB(8,pfx,base_r),
   2529                               binop(Iop_Shl64,
   2530                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2531                         mkU64(d))));
   2532          }
   2533          vassert(0); /*NOTREACHED*/
   2534       }
   2535 
   2536       default:
   2537          vpanic("disAMode(amd64)");
   2538          return 0; /*notreached*/
   2539    }
   2540 }
   2541 
   2542 
   2543 /* Figure out the number of (insn-stream) bytes constituting the amode
   2544    beginning at delta.  Is useful for getting hold of literals beyond
   2545    the end of the amode before it has been disassembled.  */
   2546 
   2547 static UInt lengthAMode ( Prefix pfx, Long delta )
   2548 {
   2549    UChar mod_reg_rm = getUChar(delta);
   2550    delta++;
   2551 
   2552    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2553       jump table seems a bit excessive.
   2554    */
   2555    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2556    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2557                                                /* is now XX0XXYYY */
   2558    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2559    switch (mod_reg_rm) {
   2560 
   2561       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2562          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2563       */
   2564       case 0x00: case 0x01: case 0x02: case 0x03:
   2565       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2566          return 1;
   2567 
   2568       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2569          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2570       */
   2571       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2572       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2573          return 2;
   2574 
   2575       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2576          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2577       */
   2578       case 0x10: case 0x11: case 0x12: case 0x13:
   2579       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2580          return 5;
   2581 
   2582       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2583       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2584       /* Not an address, but still handled. */
   2585       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2586       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2587          return 1;
   2588 
   2589       /* RIP + disp32. */
   2590       case 0x05:
   2591          return 5;
   2592 
   2593       case 0x04: {
   2594          /* SIB, with no displacement. */
   2595          UChar sib     = getUChar(delta);
   2596          UChar base_r  = toUChar(sib & 7);
   2597          /* correct since #(R13) == 8 + #(RBP) */
   2598          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2599 
   2600          if (base_is_BPor13) {
   2601             return 6;
   2602          } else {
   2603             return 2;
   2604          }
   2605       }
   2606 
   2607       /* SIB, with 8-bit displacement. */
   2608       case 0x0C:
   2609          return 3;
   2610 
   2611       /* SIB, with 32-bit displacement. */
   2612       case 0x14:
   2613          return 6;
   2614 
   2615       default:
   2616          vpanic("lengthAMode(amd64)");
   2617          return 0; /*notreached*/
   2618    }
   2619 }
   2620 
   2621 
   2622 /*------------------------------------------------------------*/
   2623 /*--- Disassembling common idioms                          ---*/
   2624 /*------------------------------------------------------------*/
   2625 
   2626 /* Handle binary integer instructions of the form
   2627       op E, G  meaning
   2628       op reg-or-mem, reg
   Is passed a ptr to the modRM byte, the actual operation, and the
   2630    data size.  Returns the address advanced completely over this
   2631    instruction.
   2632 
   2633    E(src) is reg-or-mem
   2634    G(dst) is reg.
   2635 
   2636    If E is reg, -->    GET %G,  tmp
   2637                        OP %E,   tmp
   2638                        PUT tmp, %G
   2639 
   2640    If E is mem and OP is not reversible,
   2641                 -->    (getAddr E) -> tmpa
   2642                        LD (tmpa), tmpa
   2643                        GET %G, tmp2
   2644                        OP tmpa, tmp2
   2645                        PUT tmp2, %G
   2646 
   2647    If E is mem and OP is reversible
   2648                 -->    (getAddr E) -> tmpa
   2649                        LD (tmpa), tmpa
   2650                        OP %G, tmpa
   2651                        PUT tmpa, %G
   2652 */
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the E (reg-or-mem) operand */
   IRTemp  dst0 = newTemp(ty);   /* original value of G */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* E is a register. */
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
	 putIRegG(size,pfx,rm, mkU(ty,0));
         /* NOTE: the dst0/src assignments below then read back this
            just-written zero, so flags are still computed from
            well-defined values. */
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes result and sets the flags thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         /* Plain binary op; flags depend on whether it is add/sub
            (needs both operands) or logic (needs only the result). */
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)  /* False for CMP/TEST-style ops: flags only */
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;   /* just the modRM byte consumed */
   } else {
      /* E refers to memory: decode the amode and load the source. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;   /* modRM + amode bytes consumed */
   }
}
   2747 
   2748 
   2749 
   2750 /* Handle binary integer instructions of the form
   2751       op G, E  meaning
   2752       op reg, reg-or-mem
   Is passed a ptr to the modRM byte, the actual operation, and the
   2754    data size.  Returns the address advanced completely over this
   2755    instruction.
   2756 
   2757    G(src) is reg.
   2758    E(dst) is reg-or-mem
   2759 
   2760    If E is reg, -->    GET %E,  tmp
   2761                        OP %G,   tmp
   2762                        PUT tmp, %E
   2763 
   2764    If E is mem, -->    (getAddr E) -> tmpa
   2765                        LD (tmpa), tmpv
   2766                        OP %G, tmpv
   2767                        ST tmpv, (tmpa)
   2768 */
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the G (register) operand */
   IRTemp  dst0 = newTemp(ty);   /* original value of E */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* E is a register. */
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* The dst0/src assignments below read back this zero, so
            flags are still computed from well-defined values. */
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes result and sets the flags thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         /* Plain binary op. */
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)  /* False for CMP/TEST-style ops: flags only */
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;   /* just the modRM byte consumed */
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store: the helper stores the result with a
               compare-and-swap against dst0 (the value originally
               loaded), making the read-modify-write atomic;
               guest_RIP_curr_instr is the restart point should the
               CAS fail. */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               /* LOCK prefix: store via compare-and-swap for
                  atomicity of the read-modify-write. */
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;   /* modRM + amode bytes consumed */
   }
}
   2884 
   2885 
   2886 /* Handle move instructions of the form
   2887       mov E, G  meaning
   2888       mov reg-or-mem, reg
   Is passed a ptr to the modRM byte, and the data size.  Returns
   2890    the address advanced completely over this instruction.
   2891 
   2892    E(src) is reg-or-mem
   2893    G(dst) is reg.
   2894 
   2895    If E is reg, -->    GET %E,  tmpv
   2896                        PUT tmpv, %G
   2897 
   2898    If E is mem  -->    (getAddr E) -> tmpa
   2899                        LD (tmpa), tmpb
   2900                        PUT tmpb, %G
   2901 */
   2902 static
   2903 ULong dis_mov_E_G ( VexAbiInfo* vbi,
   2904                     Prefix      pfx,
   2905                     Int         size,
   2906                     Long        delta0 )
   2907 {
   2908    Int len;
   2909    UChar rm = getUChar(delta0);
   2910    HChar dis_buf[50];
   2911 
   2912    if (epartIsReg(rm)) {
   2913       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   2914       DIP("mov%c %s,%s\n", nameISize(size),
   2915                            nameIRegE(size,pfx,rm),
   2916                            nameIRegG(size,pfx,rm));
   2917       return 1+delta0;
   2918    }
   2919 
   2920    /* E refers to memory */
   2921    {
   2922       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   2923       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   2924       DIP("mov%c %s,%s\n", nameISize(size),
   2925                            dis_buf,
   2926                            nameIRegG(size,pfx,rm));
   2927       return delta0+len;
   2928    }
   2929 }
   2930 
   2931 
   2932 /* Handle move instructions of the form
   2933       mov G, E  meaning
   2934       mov reg, reg-or-mem
   Is passed a ptr to the modRM byte, and the data size.  Returns
   2936    the address advanced completely over this instruction.
   2937 
   2938    G(src) is reg.
   2939    E(dst) is reg-or-mem
   2940 
   2941    If E is reg, -->    GET %G,  tmp
   2942                        PUT tmp, %E
   2943 
   2944    If E is mem, -->    (getAddr E) -> tmpa
   2945                        GET %G, tmpv
   2946                        ST tmpv, (tmpa)
   2947 */
   2948 static
   2949 ULong dis_mov_G_E ( VexAbiInfo* vbi,
   2950                     Prefix      pfx,
   2951                     Int         size,
   2952                     Long        delta0 )
   2953 {
   2954    Int len;
   2955    UChar rm = getUChar(delta0);
   2956    HChar dis_buf[50];
   2957 
   2958    if (epartIsReg(rm)) {
   2959       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   2960       DIP("mov%c %s,%s\n", nameISize(size),
   2961                            nameIRegG(size,pfx,rm),
   2962                            nameIRegE(size,pfx,rm));
   2963       return 1+delta0;
   2964    }
   2965 
   2966    /* E refers to memory */
   2967    {
   2968       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   2969       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   2970       DIP("mov%c %s,%s\n", nameISize(size),
   2971                            nameIRegG(size,pfx,rm),
   2972                            dis_buf);
   2973       return len+delta0;
   2974    }
   2975 }
   2976 
   2977 
   2978 /* op $immediate, AL/AX/EAX/RAX. */
   2979 static
   2980 ULong dis_op_imm_A ( Int    size,
   2981                      Bool   carrying,
   2982                      IROp   op8,
   2983                      Bool   keep,
   2984                      Long   delta,
   2985                      HChar* t_amd64opc )
   2986 {
   2987    Int    size4 = imin(size,4);
   2988    IRType ty    = szToITy(size);
   2989    IRTemp dst0  = newTemp(ty);
   2990    IRTemp src   = newTemp(ty);
   2991    IRTemp dst1  = newTemp(ty);
   2992    Long  lit    = getSDisp(size4,delta);
   2993    assign(dst0, getIRegRAX(size));
   2994    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   2995 
   2996    if (isAddSub(op8) && !carrying) {
   2997       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2998       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2999    }
   3000    else
   3001    if (isLogic(op8)) {
   3002       vassert(!carrying);
   3003       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3004       setFlags_DEP1(op8, dst1, ty);
   3005    }
   3006    else
   3007    if (op8 == Iop_Add8 && carrying) {
   3008       helper_ADC( size, dst1, dst0, src,
   3009                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3010    }
   3011    else
   3012    if (op8 == Iop_Sub8 && carrying) {
   3013       helper_SBB( size, dst1, dst0, src,
   3014                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3015    }
   3016    else
   3017       vpanic("dis_op_imm_A(amd64,guest)");
   3018 
   3019    if (keep)
   3020       putIRegRAX(size, mkexpr(dst1));
   3021 
   3022    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3023                            lit, nameIRegRAX(size));
   3024    return delta+size4;
   3025 }
   3026 
   3027 
   3028 /* Sign- and Zero-extending moves. */
   3029 static
   3030 ULong dis_movx_E_G ( VexAbiInfo* vbi,
   3031                      Prefix pfx,
   3032                      Long delta, Int szs, Int szd, Bool sign_extend )
   3033 {
   3034    UChar rm = getUChar(delta);
   3035    if (epartIsReg(rm)) {
   3036       putIRegG(szd, pfx, rm,
   3037                     doScalarWidening(
   3038                        szs,szd,sign_extend,
   3039                        getIRegE(szs,pfx,rm)));
   3040       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3041                                nameISize(szs),
   3042                                nameISize(szd),
   3043                                nameIRegE(szs,pfx,rm),
   3044                                nameIRegG(szd,pfx,rm));
   3045       return 1+delta;
   3046    }
   3047 
   3048    /* E refers to memory */
   3049    {
   3050       Int    len;
   3051       HChar  dis_buf[50];
   3052       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3053       putIRegG(szd, pfx, rm,
   3054                     doScalarWidening(
   3055                        szs,szd,sign_extend,
   3056                        loadLE(szToITy(szs),mkexpr(addr))));
   3057       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3058                                nameISize(szs),
   3059                                nameISize(szd),
   3060                                dis_buf,
   3061                                nameIRegG(szd,pfx,rm));
   3062       return len+delta;
   3063    }
   3064 }
   3065 
   3066 
/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  Quotient
   goes to RAX/EAX/AX/AL, remainder to RDX/EDX/DX/AH, following the
   IR DivMod convention (low half = quotient, high half = remainder). */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* 128-bit dividend RDX:RAX; DivMod*128to64 yields quotient in
         the low 64 bits and remainder in the high 64 bits. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* The narrower cases all funnel through the 64/32 DivMod op,
         widening dividend and divisor as needed. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* Dividend is EDX:EAX; quotient -> EAX, remainder -> EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* Dividend is DX:AX, widened to 64 bits; the 16-bit divisor
            is widened to 32.  Quotient -> AX, remainder -> DX. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* Dividend is AX, widened to 64 bits; the 8-bit divisor is
            widened in two steps.  Quotient -> AL, remainder -> AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3132 
/* Group 1 extended opcodes: immediate-operand integer ops (ADD, OR,
   ADC, SBB, AND, SUB, XOR, CMP), selected by the reg field of the
   modrm byte.  Handles both register and memory destinations; a
   LOCK-prefixed memory form is translated as a compare-and-swap.

   delta points at the modrm byte on entry; am_sz is the size of the
   addressing-mode bytes, d_sz the size of the immediate, sz the
   operation size, and d64 the (sign-extended) immediate value.
   Returns the updated delta. */
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* Pick the 8-bit base op; mkSizedOp widens it to sz later.  ADC
      and SBB get no op here -- they go through dedicated helpers
      below because they also consume the carry flag.  Case 7 (CMP)
      reuses Sub8; its result is computed but never written back. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      /* Register destination: no store, so no LOCK handling needed. */
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (7) only sets flags; skip the writeback for it. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      /* Memory destination.  disAMode is told the immediate length
         (d_sz) so RIP-relative addressing is computed correctly. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (7) performs no store at all. */
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               /* LOCKed: expect the originally-loaded value (dst0)
                  and swap in the new one atomically. */
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3238 
   3239 
   3240 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3241    expression. */
   3242 
/* Group 2 extended opcodes: rotates and shifts (ROL, ROR, RCL, RCR,
   SHL, SHR, SAR), selected by the reg field of the modrm byte.
   shift_expr must be an 8-bit typed expression giving the rotate or
   shift amount; shift_expr_txt, if non-NULL, is its printable form
   for disassembly tracing.  Sets *decode_OK to False for the invalid
   /6 encoding.  Returns the updated delta. */
static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode: 4/5/7 = SHL/SHR/SAR, 0/1 = ROL/ROR,
      2/3 = RCL/RCR (rotate-through-carry). */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   /* /6 is not a valid Grp2 encoding. */
   if (gregLO3ofRM(modrm) == 6) {
      *decode_OK = False;
      return delta;
   }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz selects the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negated sz selects the resulting rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      /* COPY thunk: DEP1 holds the literal rflags value. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits.  SAR needs
         sign-extension so the shifted-in bits are copies of the sign. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK)
         -- the "shifted one less" value, needed by the flags thunk to
         recover the carry-out bit. */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the per-size CC_OP constant: 0=B,1=W,2=L,3=Q. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      /* For sub-64-bit sizes, further reduce mod the word size so the
         Sub8 below cannot produce an out-of-range shift. */
      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before.
         Each Mux0X keeps the old thunk field when rot_amt64 == 0,
         so a zero-count rotate leaves the flags untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3514 
   3515 
   3516 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
/* Group 8 extended opcodes with an immediate bit offset: BT, BTS,
   BTR, BTC (/4 .. /7).  src_val is the 8-bit immediate bit index;
   delta on entry points at the modrm byte.  Sets *decode_OK to
   False for unsupported sizes or sub-opcodes 0..3.  Returns the
   updated delta. */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);   /* original value, widened */
   IRTemp t2m    = newTemp(Ity_I64);   /* modified value (non-BT) */
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
	putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (pfx & PFX_LOCK) {
            /* LOCKed: expect the value read above and swap in the
               modified one atomically. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3629 
   3630 
   3631 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3632    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3633    RDX:RAX/EDX:EAX/DX:AX/AX.
   3634 */
   3635 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3636                                IRTemp tmp, HChar* tmp_txt )
   3637 {
   3638    IRType ty = szToITy(sz);
   3639    IRTemp t1 = newTemp(ty);
   3640 
   3641    assign( t1, getIRegRAX(sz) );
   3642 
   3643    switch (ty) {
   3644       case Ity_I64: {
   3645          IRTemp res128  = newTemp(Ity_I128);
   3646          IRTemp resHi   = newTemp(Ity_I64);
   3647          IRTemp resLo   = newTemp(Ity_I64);
   3648          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3649          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3650          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3651          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3652          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3653          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3654          putIReg64(R_RDX, mkexpr(resHi));
   3655          putIReg64(R_RAX, mkexpr(resLo));
   3656          break;
   3657       }
   3658       case Ity_I32: {
   3659          IRTemp res64   = newTemp(Ity_I64);
   3660          IRTemp resHi   = newTemp(Ity_I32);
   3661          IRTemp resLo   = newTemp(Ity_I32);
   3662          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3663          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3664          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3665          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3666          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3667          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3668          putIRegRDX(4, mkexpr(resHi));
   3669          putIRegRAX(4, mkexpr(resLo));
   3670          break;
   3671       }
   3672       case Ity_I16: {
   3673          IRTemp res32   = newTemp(Ity_I32);
   3674          IRTemp resHi   = newTemp(Ity_I16);
   3675          IRTemp resLo   = newTemp(Ity_I16);
   3676          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3677          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3678          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3679          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3680          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3681          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3682          putIRegRDX(2, mkexpr(resHi));
   3683          putIRegRAX(2, mkexpr(resLo));
   3684          break;
   3685       }
   3686       case Ity_I8: {
   3687          IRTemp res16   = newTemp(Ity_I16);
   3688          IRTemp resHi   = newTemp(Ity_I8);
   3689          IRTemp resLo   = newTemp(Ity_I8);
   3690          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3691          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3692          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3693          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3694          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3695          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3696          putIRegRAX(2, mkexpr(res16));
   3697          break;
   3698       }
   3699       default:
   3700          ppIRType(ty);
   3701          vpanic("codegen_mulL_A_D(amd64)");
   3702    }
   3703    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3704 }
   3705 
   3706 
   3707 /* Group 3 extended opcodes. */
/* Group 3 extended opcodes: TEST imm (/0), NOT (/2), NEG (/3),
   MUL (/4), IMUL (/5), DIV (/6), IDIV (/7), selected by the reg
   field of the modrm byte.  /1 is invalid and reported via
   *decode_OK.  delta on entry points at the modrm byte; returns
   the updated delta. */
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Register operand forms. */
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            /* TEST carries an immediate (at most 4 bytes, sign-
               extended for sz == 8). */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            /* NOT does not affect flags. */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            /* NEG is modelled as 0 - operand, so the normal SUB
               flag thunk applies. */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Memory operand forms.  Only TEST has trailing immediate
         bytes, which disAMode must know about for RIP-relative
         address computation. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* LOCKed form stores via compare-and-swap against the
               value loaded above. */
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
}
   3869 
   3870 
/* Group 4 extended opcodes. */
/* Byte-sized INC/DEC of the E operand (register or memory), selected
   by the reg field of the modrm byte.  Returns the updated delta;
   sets *decode_OK to False for any other reg-field value. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);   /* original operand value */
   IRTemp t2 = newTemp(ty);   /* result value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Register operand. */
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;  /* consume just the modrm byte */
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Memory operand. */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               /* LOCK prefix: update via compare-and-swap. */
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;  /* consume modrm + addressing bytes */
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
}
   3939 
   3940 
/* Group 5 extended opcodes. */
/* INC/DEC/CALL/JMP/PUSH on the E operand, selected by the reg field
   of the modrm byte.  CALL and JMP end the basic block
   (dres->whatNext = Dis_StopHere).  Returns the updated delta; sets
   *decode_OK to False for cases not decoded here. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 DisResult* dres, Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);      /* original operand value */
   IRTemp  t2 = IRTemp_INVALID;   /* result / updated RSP */
   IRTemp  t3 = IRTemp_INVALID;   /* call/jmp target, or value to push */
   Bool    showSz = True;         /* print size suffix in DIP output? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            t2 = newTemp(Ity_I64);
            /* Push the return address (delta+1 skips the modrm byte),
               then jump to the target in t3. */
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;  /* consume just the modrm byte */
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* Only INC/DEC need the operand pre-loaded at width 'ty';
         CALL/JMP/PUSH load 64 bits themselves below. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               /* LOCK prefix: update via compare-and-swap. */
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            /* Push the return address (delta+len skips modrm plus the
               addressing bytes), then jump to the target in t3. */
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (!(sz == 8 || sz == 2)) goto unhandled;
            if (sz == 8) {
               t3 = newTemp(Ity_I64);
               assign(t3, loadLE(Ity_I64,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
	    } else {
               goto unhandled; /* awaiting test case */
	    }
         default:
         unhandled:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4093 
   4094 
   4095 /*------------------------------------------------------------*/
   4096 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4097 /*------------------------------------------------------------*/
   4098 
   4099 /* Code shared by all the string ops */
   4100 static
   4101 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4102 {
   4103    UChar logSz;
   4104    if (sz == 8 || sz == 4 || sz == 2) {
   4105       logSz = 1;
   4106       if (sz == 4) logSz = 2;
   4107       if (sz == 8) logSz = 3;
   4108       assign( t_inc,
   4109               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4110                                mkU8(logSz) ) );
   4111    } else {
   4112       assign( t_inc,
   4113               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4114    }
   4115 }
   4116 
   4117 static
   4118 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   4119                     Int sz, HChar* name, Prefix pfx )
   4120 {
   4121    IRTemp t_inc = newTemp(Ity_I64);
   4122    /* Really we ought to inspect the override prefixes, but we don't.
   4123       The following assertion catches any resulting sillyness. */
   4124    vassert(pfx == clearSegBits(pfx));
   4125    dis_string_op_increment(sz, t_inc);
   4126    dis_OP( sz, t_inc );
   4127    DIP("%s%c\n", name, nameISize(sz));
   4128 }
   4129 
   4130 static
   4131 void dis_MOVS ( Int sz, IRTemp t_inc )
   4132 {
   4133    IRType ty = szToITy(sz);
   4134    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4135    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4136 
   4137    assign( td, getIReg64(R_RDI) );
   4138    assign( ts, getIReg64(R_RSI) );
   4139 
   4140    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4141 
   4142    putIReg64( R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
   4143    putIReg64( R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
   4144 }
   4145 
   4146 static
   4147 void dis_LODS ( Int sz, IRTemp t_inc )
   4148 {
   4149    IRType ty = szToITy(sz);
   4150    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4151 
   4152    assign( ts, getIReg64(R_RSI) );
   4153 
   4154    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4155 
   4156    putIReg64( R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
   4157 }
   4158 
   4159 static
   4160 void dis_STOS ( Int sz, IRTemp t_inc )
   4161 {
   4162    IRType ty = szToITy(sz);
   4163    IRTemp ta = newTemp(ty);        /* rAX */
   4164    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4165 
   4166    assign( ta, getIRegRAX(sz) );
   4167 
   4168    assign( td, getIReg64(R_RDI) );
   4169 
   4170    storeLE( mkexpr(td), mkexpr(ta) );
   4171 
   4172    putIReg64( R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
   4173 }
   4174 
   4175 static
   4176 void dis_CMPS ( Int sz, IRTemp t_inc )
   4177 {
   4178    IRType ty  = szToITy(sz);
   4179    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4180    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4181    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4182    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4183 
   4184    assign( td, getIReg64(R_RDI) );
   4185 
   4186    assign( ts, getIReg64(R_RSI) );
   4187 
   4188    assign( tdv, loadLE(ty,mkexpr(td)) );
   4189 
   4190    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4191 
   4192    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4193 
   4194    putIReg64(R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
   4195 
   4196    putIReg64(R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
   4197 }
   4198 
   4199 static
   4200 void dis_SCAS ( Int sz, IRTemp t_inc )
   4201 {
   4202    IRType ty  = szToITy(sz);
   4203    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4204    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4205    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4206 
   4207    assign( ta, getIRegRAX(sz) );
   4208 
   4209    assign( td, getIReg64(R_RDI) );
   4210 
   4211    assign( tdv, loadLE(ty,mkexpr(td)) );
   4212 
   4213    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4214 
   4215    putIReg64(R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
   4216 }
   4217 
   4218 
/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   the insn is the last one in the basic block, and so emit a jump to
   the next insn, rather than just falling through.  'cond' is the
   continuation condition (AMD64CondAlways for a plain REP); 'rip' is
   this insn's address (looped back to) and 'rip_next' the following
   insn's address. */
static
void dis_REP_op ( AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp),
                  Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
                  Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc    = newTemp(Ity_I64);  /*  RCX  */

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting sillyness. */
   vassert(pfx == clearSegBits(pfx));

   assign( tc, getIReg64(R_RCX) );

   /* If RCX == 0, skip over the insn entirely. */
   stmt( IRStmt_Exit( binop(Iop_CmpEQ64,mkexpr(tc),mkU64(0)),
                      Ijk_Boring,
                      IRConst_U64(rip_next) ) );

   /* RCX-- */
   putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   /* One iteration of the underlying string op. */
   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc);

   if (cond == AMD64CondAlways) {
      /* Plain REP: unconditionally loop back to this insn. */
      jmp_lit(Ijk_Boring,rip);
   } else {
      /* REPE/REPNE: loop back while 'cond' holds, otherwise fall
         through to the next insn. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip) ) );
      jmp_lit(Ijk_Boring,rip_next);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
   4256 
   4257 
   4258 /*------------------------------------------------------------*/
   4259 /*--- Arithmetic, etc.                                     ---*/
   4260 /*------------------------------------------------------------*/
   4261 
   4262 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   4263 static
   4264 ULong dis_mul_E_G ( VexAbiInfo* vbi,
   4265                     Prefix      pfx,
   4266                     Int         size,
   4267                     Long        delta0 )
   4268 {
   4269    Int    alen;
   4270    HChar  dis_buf[50];
   4271    UChar  rm = getUChar(delta0);
   4272    IRType ty = szToITy(size);
   4273    IRTemp te = newTemp(ty);
   4274    IRTemp tg = newTemp(ty);
   4275    IRTemp resLo = newTemp(ty);
   4276 
   4277    assign( tg, getIRegG(size, pfx, rm) );
   4278    if (epartIsReg(rm)) {
   4279       assign( te, getIRegE(size, pfx, rm) );
   4280    } else {
   4281       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4282       assign( te, loadLE(ty,mkexpr(addr)) );
   4283    }
   4284 
   4285    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4286 
   4287    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4288 
   4289    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4290 
   4291    if (epartIsReg(rm)) {
   4292       DIP("imul%c %s, %s\n", nameISize(size),
   4293                              nameIRegE(size,pfx,rm),
   4294                              nameIRegG(size,pfx,rm));
   4295       return 1+delta0;
   4296    } else {
   4297       DIP("imul%c %s, %s\n", nameISize(size),
   4298                              dis_buf,
   4299                              nameIRegG(size,pfx,rm));
   4300       return alen+delta0;
   4301    }
   4302 }
   4303 
   4304 
/* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
/* Three-operand signed multiply: G := E * immediate, truncated to
   'size' bytes.  'litsize' is the nominal immediate width; it is
   clamped to at most 4 bytes via imin(4,litsize) both for the
   address decode and for the fetch, since no wider immediate is
   encoded. */
static
ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   Long   d64;                    /* the (sign-extended) immediate */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);       /* the E operand */
   IRTemp tl = newTemp(ty);       /* the immediate, as an IR value */
   IRTemp resLo = newTemp(ty);    /* truncated product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* The final argument tells disAMode how many immediate bytes
         follow the addressing mode. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* Truncate the sign-extended immediate to the operand size. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4351 
   4352 
   4353 /* Generate an IR sequence to do a popcount operation on the supplied
   4354    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4355    Ity_I16, Ity_I32 or Ity_I64 only. */
   4356 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4357 {
   4358    Int i;
   4359    if (ty == Ity_I16) {
   4360       IRTemp old = IRTemp_INVALID;
   4361       IRTemp nyu = IRTemp_INVALID;
   4362       IRTemp mask[4], shift[4];
   4363       for (i = 0; i < 4; i++) {
   4364          mask[i]  = newTemp(ty);
   4365          shift[i] = 1 << i;
   4366       }
   4367       assign(mask[0], mkU16(0x5555));
   4368       assign(mask[1], mkU16(0x3333));
   4369       assign(mask[2], mkU16(0x0F0F));
   4370       assign(mask[3], mkU16(0x00FF));
   4371       old = src;
   4372       for (i = 0; i < 4; i++) {
   4373          nyu = newTemp(ty);
   4374          assign(nyu,
   4375                 binop(Iop_Add16,
   4376                       binop(Iop_And16,
   4377                             mkexpr(old),
   4378                             mkexpr(mask[i])),
   4379                       binop(Iop_And16,
   4380                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4381                             mkexpr(mask[i]))));
   4382          old = nyu;
   4383       }
   4384       return nyu;
   4385    }
   4386    if (ty == Ity_I32) {
   4387       IRTemp old = IRTemp_INVALID;
   4388       IRTemp nyu = IRTemp_INVALID;
   4389       IRTemp mask[5], shift[5];
   4390       for (i = 0; i < 5; i++) {
   4391          mask[i]  = newTemp(ty);
   4392          shift[i] = 1 << i;
   4393       }
   4394       assign(mask[0], mkU32(0x55555555));
   4395       assign(mask[1], mkU32(0x33333333));
   4396       assign(mask[2], mkU32(0x0F0F0F0F));
   4397       assign(mask[3], mkU32(0x00FF00FF));
   4398       assign(mask[4], mkU32(0x0000FFFF));
   4399       old = src;
   4400       for (i = 0; i < 5; i++) {
   4401          nyu = newTemp(ty);
   4402          assign(nyu,
   4403                 binop(Iop_Add32,
   4404                       binop(Iop_And32,
   4405                             mkexpr(old),
   4406                             mkexpr(mask[i])),
   4407                       binop(Iop_And32,
   4408                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4409                             mkexpr(mask[i]))));
   4410          old = nyu;
   4411       }
   4412       return nyu;
   4413    }
   4414    if (ty == Ity_I64) {
   4415       IRTemp old = IRTemp_INVALID;
   4416       IRTemp nyu = IRTemp_INVALID;
   4417       IRTemp mask[6], shift[6];
   4418       for (i = 0; i < 6; i++) {
   4419          mask[i]  = newTemp(ty);
   4420          shift[i] = 1 << i;
   4421       }
   4422       assign(mask[0], mkU64(0x5555555555555555ULL));
   4423       assign(mask[1], mkU64(0x3333333333333333ULL));
   4424       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4425       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4426       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4427       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4428       old = src;
   4429       for (i = 0; i < 6; i++) {
   4430          nyu = newTemp(ty);
   4431          assign(nyu,
   4432                 binop(Iop_Add64,
   4433                       binop(Iop_And64,
   4434                             mkexpr(old),
   4435                             mkexpr(mask[i])),
   4436                       binop(Iop_And64,
   4437                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4438                             mkexpr(mask[i]))));
   4439          old = nyu;
   4440       }
   4441       return nyu;
   4442    }
   4443    /*NOTREACHED*/
   4444    vassert(0);
   4445 }
   4446 
   4447 
/* Generate an IR sequence to do a count-leading-zeroes operation on
   the supplied IRTemp, and return a new IRTemp holding the result.
   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   the argument is zero, return the number of bits in the word (the
   natural semantics). */
static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   /* Zero-extend the operand to 64 bits ... */
   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   /* ... then shift it left so its MSB lands in bit 63; the leading
      zeroes of src64x now equal those of the original value. */
   IRTemp src64x = newTemp(Ity_I64);
   assign(src64x,
          binop(Iop_Shl64, mkexpr(src64),
                           mkU8(64 - 8 * sizeofIRType(ty))));

   // Clz64 has undefined semantics when its input is zero, so
   // special-case around that: if src64x == 0 the Mux0X selects the
   // word width instead of Clz64's result.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_Mux0X(
             unop(Iop_1Uto8,
                  binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
             unop(Iop_Clz64, mkexpr(src64x)),
             mkU64(8 * sizeofIRType(ty))
   ));

   /* Narrow the 64-bit count back to the operand width. */
   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}
   4480 
   4481 
   4482 /*------------------------------------------------------------*/
   4483 /*---                                                      ---*/
   4484 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4485 /*---                                                      ---*/
   4486 /*------------------------------------------------------------*/
   4487 
   4488 /* --- Helper functions for dealing with the register stack. --- */
   4489 
   4490 /* --- Set the emulation-warning pseudo-register. --- */
   4491 
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* Write an emulation-warning code into the guest state. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
}
   4497 
   4498 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4499 
static IRExpr* mkQNaN64 ( void )
{
  /* QNaN is 0 2047 1 0(51times)
     == 0b 11111111111b 1 0(51times)
     == 0x7FF8 0000 0000 0000
     i.e. sign 0, exponent all-ones, top mantissa bit set: the
     default quiet NaN. */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   4508 
   4509 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4510 
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack pointer from the guest state. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4515 
static void put_ftop ( IRExpr* e )
{
   /* Write the x87 top-of-stack pointer; 'e' must be Ity_I32. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4521 
   4522 /* --------- Get/put the C3210 bits. --------- */
   4523 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* Read the FPU C3..C0 condition bits from the guest state. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4528 
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   /* Write the FPU C3..C0 condition bits; 'e' must be Ity_I64. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4534 
   4535 /* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   /* FPROUND is stored 64-bit wide in the guest state; narrow it. */
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4540 
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   /* Widen the 32-bit rounding-mode value to the 64-bit state slot. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4546 
   4547 
   4548 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4549 /* Produces a value in 0 .. 3, which is encoded as per the type
   4550    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4551    per IRRoundingMode, we merely need to get it and mask it for
   4552    safety.
   4553 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already encoded per IRRoundingMode; the mask
      just guarantees a value in 0 .. 3. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4558 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Placeholder used by the XXXROUNDINGFIXME call sites: always
      round-to-nearest, ignoring the guest's actual rounding mode. */
   return mkU32(Irrm_NEAREST);
}
   4563 
   4564 
   4565 /* --------- Get/set FP register tag bytes. --------- */
   4566 
   4567 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4568 
   4569 static void put_ST_TAG ( Int i, IRExpr* value )
   4570 {
   4571    IRRegArray* descr;
   4572    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   4573    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4574    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   4575 }
   4576 
   4577 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   4578    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   4579 
   4580 static IRExpr* get_ST_TAG ( Int i )
   4581 {
   4582    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4583    return IRExpr_GetI( descr, get_ftop(), i );
   4584 }
   4585 
   4586 
   4587 /* --------- Get/set FP registers. --------- */
   4588 
   4589 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   4590    register's tag to indicate the register is full.  The previous
   4591    state of the register is not checked. */
   4592 
   4593 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   4594 {
   4595    IRRegArray* descr;
   4596    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   4597    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4598    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   4599    /* Mark the register as in-use. */
   4600    put_ST_TAG(i, mkU8(1));
   4601 }
   4602 
   4603 /* Given i, and some expression e, emit
   4604       ST(i) = is_full(i) ? NaN : e
   4605    and set the tag accordingly.
   4606 */
   4607 
   4608 static void put_ST ( Int i, IRExpr* value )
   4609 {
   4610    put_ST_UNCHECKED( i,
   4611                      IRExpr_Mux0X( get_ST_TAG(i),
   4612                                    /* 0 means empty */
   4613                                    value,
   4614                                    /* non-0 means full */
   4615                                    mkQNaN64()
   4616                    )
   4617    );
   4618 }
   4619 
   4620 
   4621 /* Given i, generate an expression yielding 'ST(i)'. */
   4622 
   4623 static IRExpr* get_ST_UNCHECKED ( Int i )
   4624 {
   4625    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4626    return IRExpr_GetI( descr, get_ftop(), i );
   4627 }
   4628 
   4629 
   4630 /* Given i, generate an expression yielding
   4631   is_full(i) ? ST(i) : NaN
   4632 */
   4633 
   4634 static IRExpr* get_ST ( Int i )
   4635 {
   4636    return
   4637       IRExpr_Mux0X( get_ST_TAG(i),
   4638                     /* 0 means empty */
   4639                     mkQNaN64(),
   4640                     /* non-0 means full */
   4641                     get_ST_UNCHECKED(i));
   4642 }
   4643 
   4644 
   4645 /* Adjust FTOP downwards by one register. */
   4646 
static void fp_push ( void )
{
   /* The register stack grows downwards: pushing decrements FTOP. */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   4651 
   4652 /* Adjust FTOP upwards by one register, and mark the vacated register
   4653    as empty.  */
   4654 
static void fp_pop ( void )
{
   /* Mark the vacated slot (the old ST(0)) empty, then bump FTOP. */
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   4660 
   4661 /* Clear the C2 bit of the FPU status register, for
   4662    sin/cos/tan/sincos. */
   4663 
static void clear_C2 ( void )
{
   /* AND the C2 bit out of the C3210 word, leaving the others. */
   put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
}
   4668 
   4669 /* Invent a plausible-looking FPU status word value:
   4670       ((ftop & 7) << 11) | (c3210 & 0x4700)
   4671  */
   4672 static IRExpr* get_FPU_sw ( void )
   4673 {
   4674    return
   4675       unop(Iop_32to16,
   4676            binop(Iop_Or32,
   4677                  binop(Iop_Shl32,
   4678                        binop(Iop_And32, get_ftop(), mkU32(7)),
   4679                              mkU8(11)),
   4680                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   4681                                         mkU32(0x4700))
   4682       ));
   4683 }
   4684 
   4685 
   4686 /* ------------------------------------------------------- */
   4687 /* Given all that stack-mangling junk, we can now go ahead
   4688    and describe FP instructions.
   4689 */
   4690 
   4691 /* ST(0) = ST(0) `op` mem64/32(addr)
   4692    Need to check ST(0)'s tag on read, but not on write.
   4693 */
   4694 static
   4695 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4696                          IROp op, Bool dbl )
   4697 {
   4698    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4699    if (dbl) {
   4700       put_ST_UNCHECKED(0,
   4701          triop( op,
   4702                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4703                 get_ST(0),
   4704                 loadLE(Ity_F64,mkexpr(addr))
   4705          ));
   4706    } else {
   4707       put_ST_UNCHECKED(0,
   4708          triop( op,
   4709                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4710                 get_ST(0),
   4711                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   4712          ));
   4713    }
   4714 }
   4715 
   4716 
   4717 /* ST(0) = mem64/32(addr) `op` ST(0)
   4718    Need to check ST(0)'s tag on read, but not on write.
   4719 */
   4720 static
   4721 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4722                             IROp op, Bool dbl )
   4723 {
   4724    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4725    if (dbl) {
   4726       put_ST_UNCHECKED(0,
   4727          triop( op,
   4728                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4729                 loadLE(Ity_F64,mkexpr(addr)),
   4730                 get_ST(0)
   4731          ));
   4732    } else {
   4733       put_ST_UNCHECKED(0,
   4734          triop( op,
   4735                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4736                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   4737                 get_ST(0)
   4738          ));
   4739    }
   4740 }
   4741 
   4742 
   4743 /* ST(dst) = ST(dst) `op` ST(src).
   4744    Check dst and src tags when reading but not on write.
   4745 */
   4746 static
   4747 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4748                       Bool pop_after )
   4749 {
   4750    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4751    put_ST_UNCHECKED(
   4752       st_dst,
   4753       triop( op,
   4754              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4755              get_ST(st_dst),
   4756              get_ST(st_src) )
   4757    );
   4758    if (pop_after)
   4759       fp_pop();
   4760 }
   4761 
   4762 /* ST(dst) = ST(src) `op` ST(dst).
   4763    Check dst and src tags when reading but not on write.
   4764 */
   4765 static
   4766 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4767                          Bool pop_after )
   4768 {
   4769    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4770    put_ST_UNCHECKED(
   4771       st_dst,
   4772       triop( op,
   4773              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4774              get_ST(st_src),
   4775              get_ST(st_dst) )
   4776    );
   4777    if (pop_after)
   4778       fp_pop();
   4779 }
   4780 
   4781 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   4782 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   4783 {
   4784    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   4785    /* This is a bit of a hack (and isn't really right).  It sets
   4786       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   4787       documentation implies A and S are unchanged.
   4788    */
   4789    /* It's also fishy in that it is used both for COMIP and
   4790       UCOMIP, and they aren't the same (although similar). */
   4791    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   4792    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   4793    stmt( IRStmt_Put(
   4794             OFFB_CC_DEP1,
   4795             binop( Iop_And64,
   4796                    unop( Iop_32Uto64,
   4797                          binop(Iop_CmpF64, get_ST(0), get_ST(i))),
   4798                    mkU64(0x45)
   4799         )));
   4800    if (pop_after)
   4801       fp_pop();
   4802 }
   4803 
   4804 
   4805 /* returns
   4806    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   4807 */
   4808 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   4809 {
   4810    IRTemp t32 = newTemp(Ity_I32);
   4811    assign( t32, e32 );
   4812    return
   4813       IRExpr_Mux0X(
   4814          unop(Iop_1Uto8,
   4815               binop(Iop_CmpLT64U,
   4816                     unop(Iop_32Uto64,
   4817                          binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   4818                     mkU64(65536))),
   4819          mkU16( 0x8000 ),
   4820          unop(Iop_32to16, mkexpr(t32)));
   4821 }
   4822 
   4823 
   4824 static
   4825 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   4826                 VexAbiInfo* vbi, Prefix pfx, Long delta )
   4827 {
   4828    Int    len;
   4829    UInt   r_src, r_dst;
   4830    HChar  dis_buf[50];
   4831    IRTemp t1, t2;
   4832 
   4833    /* On entry, delta points at the second byte of the insn (the modrm
   4834       byte).*/
   4835    UChar first_opcode = getUChar(delta-1);
   4836    UChar modrm        = getUChar(delta+0);
   4837 
   4838    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   4839 
   4840    if (first_opcode == 0xD8) {
   4841       if (modrm < 0xC0) {
   4842 
   4843          /* bits 5,4,3 are an opcode extension, and the modRM also
   4844            specifies an address. */
   4845          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   4846          delta += len;
   4847 
   4848          switch (gregLO3ofRM(modrm)) {
   4849 
   4850             case 0: /* FADD single-real */
   4851                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   4852                break;
   4853 
   4854             case 1: /* FMUL single-real */
   4855                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   4856                break;
   4857 
   4858 //..             case 2: /* FCOM single-real */
   4859 //..                DIP("fcoms %s\n", dis_buf);
   4860 //..                /* This forces C1 to zero, which isn't right. */
   4861 //..                put_C3210(
   4862 //..                    binop( Iop_And32,
   4863 //..                           binop(Iop_Shl32,
   4864 //..                                 binop(Iop_CmpF64,
   4865 //..                                       get_ST(0),
   4866 //..                                       unop(Iop_F32toF64,
   4867 //..                                            loadLE(Ity_F32,mkexpr(addr)))),
   4868 //..                                 mkU8(8)),
   4869 //..                           mkU32(0x4500)
   4870 //..                    ));
   4871 //..                break;
   4872 //..
   4873 //..             case 3: /* FCOMP single-real */
   4874 //..                DIP("fcomps %s\n", dis_buf);
   4875 //..                /* This forces C1 to zero, which isn't right. */
   4876 //..                put_C3210(
   4877 //..                    binop( Iop_And32,
   4878 //..                           binop(Iop_Shl32,
   4879 //..                                 binop(Iop_CmpF64,
   4880 //..                                       get_ST(0),
   4881 //..                                       unop(Iop_F32toF64,
   4882 //..                                            loadLE(Ity_F32,mkexpr(addr)))),
   4883 //..                                 mkU8(8)),
   4884 //..                           mkU32(0x4500)
   4885 //..                    ));
   4886 //..                fp_pop();
   4887 //..                break;
   4888 
   4889             case 4: /* FSUB single-real */
   4890                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   4891                break;
   4892 
   4893             case 5: /* FSUBR single-real */
   4894                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   4895                break;
   4896 
   4897             case 6: /* FDIV single-real */
   4898                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   4899                break;
   4900 
   4901             case 7: /* FDIVR single-real */
   4902                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   4903                break;
   4904 
   4905             default:
   4906                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   4907                vex_printf("first_opcode == 0xD8\n");
   4908                goto decode_fail;
   4909          }
   4910       } else {
   4911          delta++;
   4912          switch (modrm) {
   4913 
   4914             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   4915                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   4916                break;
   4917 
   4918             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   4919                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   4920                break;
   4921 
   4922             /* Dunno if this is right */
   4923             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   4924                r_dst = (UInt)modrm - 0xD0;
   4925                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   4926                /* This forces C1 to zero, which isn't right. */
   4927                put_C3210(
   4928                    unop(Iop_32Uto64,
   4929                    binop( Iop_And32,
   4930                           binop(Iop_Shl32,
   4931                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   4932                                 mkU8(8)),
   4933                           mkU32(0x4500)
   4934                    )));
   4935                break;
   4936 
   4937             /* Dunno if this is right */
   4938             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   4939                r_dst = (UInt)modrm - 0xD8;
   4940                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   4941                /* This forces C1 to zero, which isn't right. */
   4942                put_C3210(
   4943                    unop(Iop_32Uto64,
   4944                    binop( Iop_And32,
   4945                           binop(Iop_Shl32,
   4946                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   4947                                 mkU8(8)),
   4948                           mkU32(0x4500)
   4949                    )));
   4950                fp_pop();
   4951                break;
   4952 
   4953             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   4954                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   4955                break;
   4956 
   4957             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   4958                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   4959                break;
   4960 
   4961             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   4962                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   4963                break;
   4964 
   4965             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   4966                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   4967                break;
   4968 
   4969             default:
   4970                goto decode_fail;
   4971          }
   4972       }
   4973    }
   4974 
   4975    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   4976    else
   4977    if (first_opcode == 0xD9) {
   4978       if (modrm < 0xC0) {
   4979 
   4980          /* bits 5,4,3 are an opcode extension, and the modRM also
   4981             specifies an address. */
   4982          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   4983          delta += len;
   4984 
   4985          switch (gregLO3ofRM(modrm)) {
   4986 
   4987             case 0: /* FLD single-real */
   4988                DIP("flds %s\n", dis_buf);
   4989                fp_push();
   4990                put_ST(0, unop(Iop_F32toF64,
   4991                               loadLE(Ity_F32, mkexpr(addr))));
   4992                break;
   4993 
   4994             case 2: /* FST single-real */
   4995                DIP("fsts %s\n", dis_buf);
   4996                storeLE(mkexpr(addr),
   4997                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4998                break;
   4999 
   5000             case 3: /* FSTP single-real */
   5001                DIP("fstps %s\n", dis_buf);
   5002                storeLE(mkexpr(addr),
   5003                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5004                fp_pop();
   5005                break;
   5006 
   5007             case 4: { /* FLDENV m28 */
   5008                /* Uses dirty helper:
   5009                      VexEmWarn amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5010                IRTemp    ew = newTemp(Ity_I32);
   5011                IRTemp   w64 = newTemp(Ity_I64);
   5012                IRDirty*   d = unsafeIRDirty_0_N (
   5013                                  0/*regparms*/,
   5014                                  "amd64g_dirtyhelper_FLDENV",
   5015                                  &amd64g_dirtyhelper_FLDENV,
   5016                                  mkIRExprVec_1( mkexpr(addr) )
   5017                               );
   5018                d->needsBBP = True;
   5019                d->tmp      = w64;
   5020                /* declare we're reading memory */
   5021                d->mFx   = Ifx_Read;
   5022                d->mAddr = mkexpr(addr);
   5023                d->mSize = 28;
   5024 
   5025                /* declare we're writing guest state */
   5026                d->nFxState = 4;
   5027 
   5028                d->fxState[0].fx     = Ifx_Write;
   5029                d->fxState[0].offset = OFFB_FTOP;
   5030                d->fxState[0].size   = sizeof(UInt);
   5031 
   5032                d->fxState[1].fx     = Ifx_Write;
   5033                d->fxState[1].offset = OFFB_FPTAGS;
   5034                d->fxState[1].size   = 8 * sizeof(UChar);
   5035 
   5036                d->fxState[2].fx     = Ifx_Write;
   5037                d->fxState[2].offset = OFFB_FPROUND;
   5038                d->fxState[2].size   = sizeof(ULong);
   5039 
   5040                d->fxState[3].fx     = Ifx_Write;
   5041                d->fxState[3].offset = OFFB_FC3210;
   5042                d->fxState[3].size   = sizeof(ULong);
   5043 
   5044                stmt( IRStmt_Dirty(d) );
   5045 
   5046                /* ew contains any emulation warning we may need to
   5047                   issue.  If needed, side-exit to the next insn,
   5048                   reporting the warning, so that Valgrind's dispatcher
   5049                   sees the warning. */
   5050 	       assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5051                put_emwarn( mkexpr(ew) );
   5052                stmt(
   5053                   IRStmt_Exit(
   5054                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5055                      Ijk_EmWarn,
   5056                      IRConst_U64( guest_RIP_bbstart+delta )
   5057                   )
   5058                );
   5059 
   5060                DIP("fldenv %s\n", dis_buf);
   5061                break;
   5062             }
   5063 
   5064             case 5: {/* FLDCW */
   5065                /* The only thing we observe in the control word is the
   5066                   rounding mode.  Therefore, pass the 16-bit value
   5067                   (x87 native-format control word) to a clean helper,
   5068                   getting back a 64-bit value, the lower half of which
   5069                   is the FPROUND value to store, and the upper half of
   5070                   which is the emulation-warning token which may be
   5071                   generated.
   5072                */
   5073                /* ULong amd64h_check_fldcw ( ULong ); */
   5074                IRTemp t64 = newTemp(Ity_I64);
   5075                IRTemp ew = newTemp(Ity_I32);
   5076                DIP("fldcw %s\n", dis_buf);
   5077                assign( t64, mkIRExprCCall(
   5078                                Ity_I64, 0/*regparms*/,
   5079                                "amd64g_check_fldcw",
   5080                                &amd64g_check_fldcw,
   5081                                mkIRExprVec_1(
   5082                                   unop( Iop_16Uto64,
   5083                                         loadLE(Ity_I16, mkexpr(addr)))
   5084                                )
   5085                             )
   5086                      );
   5087 
   5088                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5089                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5090                put_emwarn( mkexpr(ew) );
   5091                /* Finally, if an emulation warning was reported,
   5092                   side-exit to the next insn, reporting the warning,
   5093                   so that Valgrind's dispatcher sees the warning. */
   5094                stmt(
   5095                   IRStmt_Exit(
   5096                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5097                      Ijk_EmWarn,
   5098                      IRConst_U64( guest_RIP_bbstart+delta )
   5099                   )
   5100                );
   5101                break;
   5102             }
   5103 
   5104             case 6: { /* FNSTENV m28 */
   5105                /* Uses dirty helper:
   5106                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5107                IRDirty* d = unsafeIRDirty_0_N (
   5108                                0/*regparms*/,
   5109                                "amd64g_dirtyhelper_FSTENV",
   5110                                &amd64g_dirtyhelper_FSTENV,
   5111                                mkIRExprVec_1( mkexpr(addr) )
   5112                             );
   5113                d->needsBBP = True;
   5114                /* declare we're writing memory */
   5115                d->mFx   = Ifx_Write;
   5116                d->mAddr = mkexpr(addr);
   5117                d->mSize = 28;
   5118 
   5119                /* declare we're reading guest state */
   5120                d->nFxState = 4;
   5121 
   5122                d->fxState[0].fx     = Ifx_Read;
   5123                d->fxState[0].offset = OFFB_FTOP;
   5124                d->fxState[0].size   = sizeof(UInt);
   5125 
   5126                d->fxState[1].fx     = Ifx_Read;
   5127                d->fxState[1].offset = OFFB_FPTAGS;
   5128                d->fxState[1].size   = 8 * sizeof(UChar);
   5129 
   5130                d->fxState[2].fx     = Ifx_Read;
   5131                d->fxState[2].offset = OFFB_FPROUND;
   5132                d->fxState[2].size   = sizeof(ULong);
   5133 
   5134                d->fxState[3].fx     = Ifx_Read;
   5135                d->fxState[3].offset = OFFB_FC3210;
   5136                d->fxState[3].size   = sizeof(ULong);
   5137 
   5138                stmt( IRStmt_Dirty(d) );
   5139 
   5140                DIP("fnstenv %s\n", dis_buf);
   5141                break;
   5142             }
   5143 
   5144             case 7: /* FNSTCW */
   5145                /* Fake up a native x87 FPU control word.  The only
   5146                   thing it depends on is FPROUND[1:0], so call a clean
   5147                   helper to cook it up. */
   5148                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5149                DIP("fnstcw %s\n", dis_buf);
   5150                storeLE(
   5151                   mkexpr(addr),
   5152                   unop( Iop_64to16,
   5153                         mkIRExprCCall(
   5154                            Ity_I64, 0/*regp*/,
   5155                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5156                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5157                         )
   5158                   )
   5159                );
   5160                break;
   5161 
   5162             default:
   5163                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5164                vex_printf("first_opcode == 0xD9\n");
   5165                goto decode_fail;
   5166          }
   5167 
   5168       } else {
   5169          delta++;
   5170          switch (modrm) {
   5171 
   5172             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5173                r_src = (UInt)modrm - 0xC0;
   5174                DIP("fld %%st(%u)\n", r_src);
   5175                t1 = newTemp(Ity_F64);
   5176                assign(t1, get_ST(r_src));
   5177                fp_push();
   5178                put_ST(0, mkexpr(t1));
   5179                break;
   5180 
   5181             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5182                r_src = (UInt)modrm - 0xC8;
   5183                DIP("fxch %%st(%u)\n", r_src);
   5184                t1 = newTemp(Ity_F64);
   5185                t2 = newTemp(Ity_F64);
   5186                assign(t1, get_ST(0));
   5187                assign(t2, get_ST(r_src));
   5188                put_ST_UNCHECKED(0, mkexpr(t2));
   5189                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5190                break;
   5191 
   5192             case 0xE0: /* FCHS */
   5193                DIP("fchs\n");
   5194                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5195                break;
   5196 
   5197             case 0xE1: /* FABS */
   5198                DIP("fabs\n");
   5199                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5200                break;
   5201 
   5202             case 0xE5: { /* FXAM */
   5203                /* This is an interesting one.  It examines %st(0),
   5204                   regardless of whether the tag says it's empty or not.
   5205                   Here, just pass both the tag (in our format) and the
   5206                   value (as a double, actually a ULong) to a helper
   5207                   function. */
   5208                IRExpr** args
   5209                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5210                                    unop(Iop_ReinterpF64asI64,
   5211                                         get_ST_UNCHECKED(0)) );
   5212                put_C3210(mkIRExprCCall(
   5213                             Ity_I64,
   5214                             0/*regparm*/,
   5215                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5216                             args
   5217                         ));
   5218                DIP("fxam\n");
   5219                break;
   5220             }
   5221 
   5222             case 0xE8: /* FLD1 */
   5223                DIP("fld1\n");
   5224                fp_push();
   5225                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5226                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5227                break;
   5228 
   5229             case 0xE9: /* FLDL2T */
   5230                DIP("fldl2t\n");
   5231                fp_push();
   5232                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5233                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5234                break;
   5235 
   5236             case 0xEA: /* FLDL2E */
   5237                DIP("fldl2e\n");
   5238                fp_push();
   5239                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5240                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5241                break;
   5242 
   5243             case 0xEB: /* FLDPI */
   5244                DIP("fldpi\n");
   5245                fp_push();
   5246                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5247                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5248                break;
   5249 
   5250             case 0xEC: /* FLDLG2 */
   5251                DIP("fldlg2\n");
   5252                fp_push();
   5253                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5254                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5255                break;
   5256 
   5257             case 0xED: /* FLDLN2 */
   5258                DIP("fldln2\n");
   5259                fp_push();
   5260                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5261                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5262                break;
   5263 
   5264             case 0xEE: /* FLDZ */
   5265                DIP("fldz\n");
   5266                fp_push();
   5267                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5268                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5269                break;
   5270 
   5271             case 0xF0: /* F2XM1 */
   5272                DIP("f2xm1\n");
   5273                put_ST_UNCHECKED(0,
   5274                   binop(Iop_2xm1F64,
   5275                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5276                         get_ST(0)));
   5277                break;
   5278 
   5279             case 0xF1: /* FYL2X */
   5280                DIP("fyl2x\n");
   5281                put_ST_UNCHECKED(1,
   5282                   triop(Iop_Yl2xF64,
   5283                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5284                         get_ST(1),
   5285                         get_ST(0)));
   5286                fp_pop();
   5287                break;
   5288 
   5289             case 0xF2: /* FPTAN */
   5290                DIP("ftan\n");
   5291                put_ST_UNCHECKED(0,
   5292                   binop(Iop_TanF64,
   5293                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5294                         get_ST(0)));
   5295                fp_push();
   5296                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   5297                clear_C2(); /* HACK */
   5298                break;
   5299 
   5300             case 0xF3: /* FPATAN */
   5301                DIP("fpatan\n");
   5302                put_ST_UNCHECKED(1,
   5303                   triop(Iop_AtanF64,
   5304                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5305                         get_ST(1),
   5306                         get_ST(0)));
   5307                fp_pop();
   5308                break;
   5309 
   5310             case 0xF4: { /* FXTRACT */
   5311                IRTemp argF = newTemp(Ity_F64);
   5312                IRTemp sigF = newTemp(Ity_F64);
   5313                IRTemp expF = newTemp(Ity_F64);
   5314                IRTemp argI = newTemp(Ity_I64);
   5315                IRTemp sigI = newTemp(Ity_I64);
   5316                IRTemp expI = newTemp(Ity_I64);
   5317                DIP("fxtract\n");
   5318                assign( argF, get_ST(0) );
   5319                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5320                assign( sigI,
   5321                        mkIRExprCCall(
   5322                           Ity_I64, 0/*regparms*/,
   5323                           "x86amd64g_calculate_FXTRACT",
   5324                           &x86amd64g_calculate_FXTRACT,
   5325                           mkIRExprVec_2( mkexpr(argI),
   5326                                          mkIRExpr_HWord(0)/*sig*/ ))
   5327                );
   5328                assign( expI,
   5329                        mkIRExprCCall(
   5330                           Ity_I64, 0/*regparms*/,
   5331                           "x86amd64g_calculate_FXTRACT",
   5332                           &x86amd64g_calculate_FXTRACT,
   5333                           mkIRExprVec_2( mkexpr(argI),
   5334                                          mkIRExpr_HWord(1)/*exp*/ ))
   5335                );
   5336                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5337                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5338                /* exponent */
   5339                put_ST_UNCHECKED(0, mkexpr(expF) );
   5340                fp_push();
   5341                /* significand */
   5342                put_ST(0, mkexpr(sigF) );
   5343                break;
   5344             }
   5345 
   5346             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5347                IRTemp a1 = newTemp(Ity_F64);
   5348                IRTemp a2 = newTemp(Ity_F64);
   5349                DIP("fprem1\n");
   5350                /* Do FPREM1 twice, once to get the remainder, and once
   5351                   to get the C3210 flag values. */
   5352                assign( a1, get_ST(0) );
   5353                assign( a2, get_ST(1) );
   5354                put_ST_UNCHECKED(0,
   5355                   triop(Iop_PRem1F64,
   5356                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5357                         mkexpr(a1),
   5358                         mkexpr(a2)));
   5359                put_C3210(
   5360                   unop(Iop_32Uto64,
   5361                   triop(Iop_PRem1C3210F64,
   5362                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5363                         mkexpr(a1),
   5364                         mkexpr(a2)) ));
   5365                break;
   5366             }
   5367 
   5368             case 0xF7: /* FINCSTP */
   5369                DIP("fincstp\n");
   5370                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5371                break;
   5372 
   5373             case 0xF8: { /* FPREM -- not IEEE compliant */
   5374                IRTemp a1 = newTemp(Ity_F64);
   5375                IRTemp a2 = newTemp(Ity_F64);
   5376                DIP("fprem\n");
   5377                /* Do FPREM twice, once to get the remainder, and once
   5378                   to get the C3210 flag values. */
   5379                assign( a1, get_ST(0) );
   5380                assign( a2, get_ST(1) );
   5381                put_ST_UNCHECKED(0,
   5382                   triop(Iop_PRemF64,
   5383                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5384                         mkexpr(a1),
   5385                         mkexpr(a2)));
   5386                put_C3210(
   5387                   unop(Iop_32Uto64,
   5388                   triop(Iop_PRemC3210F64,
   5389                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5390                         mkexpr(a1),
   5391                         mkexpr(a2)) ));
   5392                break;
   5393             }
   5394 
   5395             case 0xF9: /* FYL2XP1 */
   5396                DIP("fyl2xp1\n");
   5397                put_ST_UNCHECKED(1,
   5398                   triop(Iop_Yl2xp1F64,
   5399                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5400                         get_ST(1),
   5401                         get_ST(0)));
   5402                fp_pop();
   5403                break;
   5404 
   5405             case 0xFA: /* FSQRT */
   5406                DIP("fsqrt\n");
   5407                put_ST_UNCHECKED(0,
   5408                   binop(Iop_SqrtF64,
   5409                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5410                         get_ST(0)));
   5411                break;
   5412 
   5413             case 0xFB: { /* FSINCOS */
   5414                IRTemp a1 = newTemp(Ity_F64);
   5415                assign( a1, get_ST(0) );
   5416                DIP("fsincos\n");
   5417                put_ST_UNCHECKED(0,
   5418                   binop(Iop_SinF64,
   5419                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5420                         mkexpr(a1)));
   5421                fp_push();
   5422                put_ST(0,
   5423                   binop(Iop_CosF64,
   5424                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5425                         mkexpr(a1)));
   5426                clear_C2(); /* HACK */
   5427                break;
   5428             }
   5429 
   5430             case 0xFC: /* FRNDINT */
   5431                DIP("frndint\n");
   5432                put_ST_UNCHECKED(0,
   5433                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   5434                break;
   5435 
   5436             case 0xFD: /* FSCALE */
   5437                DIP("fscale\n");
   5438                put_ST_UNCHECKED(0,
   5439                   triop(Iop_ScaleF64,
   5440                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5441                         get_ST(0),
   5442                         get_ST(1)));
   5443                break;
   5444 
   5445             case 0xFE: /* FSIN */
   5446                DIP("fsin\n");
   5447                put_ST_UNCHECKED(0,
   5448                   binop(Iop_SinF64,
   5449                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5450                         get_ST(0)));
   5451                clear_C2(); /* HACK */
   5452                break;
   5453 
   5454             case 0xFF: /* FCOS */
   5455                DIP("fcos\n");
   5456                put_ST_UNCHECKED(0,
   5457                   binop(Iop_CosF64,
   5458                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5459                         get_ST(0)));
   5460                clear_C2(); /* HACK */
   5461                break;
   5462 
   5463             default:
   5464                goto decode_fail;
   5465          }
   5466       }
   5467    }
   5468 
   5469    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   5470    else
   5471    if (first_opcode == 0xDA) {
   5472 
   5473       if (modrm < 0xC0) {
   5474 
   5475          /* bits 5,4,3 are an opcode extension, and the modRM also
   5476             specifies an address. */
   5477          IROp   fop;
   5478          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5479          delta += len;
   5480          switch (gregLO3ofRM(modrm)) {
   5481 
   5482             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   5483                DIP("fiaddl %s\n", dis_buf);
   5484                fop = Iop_AddF64;
   5485                goto do_fop_m32;
   5486 
   5487             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   5488                DIP("fimull %s\n", dis_buf);
   5489                fop = Iop_MulF64;
   5490                goto do_fop_m32;
   5491 
   5492             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   5493                DIP("fisubl %s\n", dis_buf);
   5494                fop = Iop_SubF64;
   5495                goto do_fop_m32;
   5496 
   5497             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   5498                DIP("fisubrl %s\n", dis_buf);
   5499                fop = Iop_SubF64;
   5500                goto do_foprev_m32;
   5501 
   5502             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   5503                DIP("fisubl %s\n", dis_buf);
   5504                fop = Iop_DivF64;
   5505                goto do_fop_m32;
   5506 
   5507             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   5508                DIP("fidivrl %s\n", dis_buf);
   5509                fop = Iop_DivF64;
   5510                goto do_foprev_m32;
   5511 
   5512             do_fop_m32:
   5513                put_ST_UNCHECKED(0,
   5514                   triop(fop,
   5515                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5516                         get_ST(0),
   5517                         unop(Iop_I32StoF64,
   5518                              loadLE(Ity_I32, mkexpr(addr)))));
   5519                break;
   5520 
   5521             do_foprev_m32:
   5522                put_ST_UNCHECKED(0,
   5523                   triop(fop,
   5524                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5525                         unop(Iop_I32StoF64,
   5526                              loadLE(Ity_I32, mkexpr(addr))),
   5527                         get_ST(0)));
   5528                break;
   5529 
   5530             default:
   5531                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5532                vex_printf("first_opcode == 0xDA\n");
   5533                goto decode_fail;
   5534          }
   5535 
   5536       } else {
   5537 
   5538          delta++;
   5539          switch (modrm) {
   5540 
   5541             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   5542                r_src = (UInt)modrm - 0xC0;
   5543                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   5544                put_ST_UNCHECKED(0,
   5545                                 IRExpr_Mux0X(
   5546                                     unop(Iop_1Uto8,
   5547                                          mk_amd64g_calculate_condition(AMD64CondB)),
   5548                                     get_ST(0), get_ST(r_src)) );
   5549                break;
   5550 
   5551             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   5552                r_src = (UInt)modrm - 0xC8;
   5553                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   5554                put_ST_UNCHECKED(0,
   5555                                 IRExpr_Mux0X(
   5556                                     unop(Iop_1Uto8,
   5557                                          mk_amd64g_calculate_condition(AMD64CondZ)),
   5558                                     get_ST(0), get_ST(r_src)) );
   5559                break;
   5560 
   5561             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   5562                r_src = (UInt)modrm - 0xD0;
   5563                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   5564                put_ST_UNCHECKED(0,
   5565                                 IRExpr_Mux0X(
   5566                                     unop(Iop_1Uto8,
   5567                                          mk_amd64g_calculate_condition(AMD64CondBE)),
   5568                                     get_ST(0), get_ST(r_src)) );
   5569                break;
   5570 
   5571             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   5572                r_src = (UInt)modrm - 0xD8;
   5573                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   5574                put_ST_UNCHECKED(0,
   5575                                 IRExpr_Mux0X(
   5576                                     unop(Iop_1Uto8,
   5577                                          mk_amd64g_calculate_condition(AMD64CondP)),
   5578                                     get_ST(0), get_ST(r_src)) );
   5579                break;
   5580 
   5581             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   5582                DIP("fucompp %%st(0),%%st(1)\n");
   5583                /* This forces C1 to zero, which isn't right. */
   5584                put_C3210(
   5585                    unop(Iop_32Uto64,
   5586                    binop( Iop_And32,
   5587                           binop(Iop_Shl32,
   5588                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5589                                 mkU8(8)),
   5590                           mkU32(0x4500)
   5591                    )));
   5592                fp_pop();
   5593                fp_pop();
   5594                break;
   5595 
   5596             default:
   5597                goto decode_fail;
   5598          }
   5599 
   5600       }
   5601    }
   5602 
   5603    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   5604    else
   5605    if (first_opcode == 0xDB) {
   5606       if (modrm < 0xC0) {
   5607 
   5608          /* bits 5,4,3 are an opcode extension, and the modRM also
   5609             specifies an address. */
   5610          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5611          delta += len;
   5612 
   5613          switch (gregLO3ofRM(modrm)) {
   5614 
   5615             case 0: /* FILD m32int */
   5616                DIP("fildl %s\n", dis_buf);
   5617                fp_push();
   5618                put_ST(0, unop(Iop_I32StoF64,
   5619                               loadLE(Ity_I32, mkexpr(addr))));
   5620                break;
   5621 
   5622             case 1: /* FISTTPL m32 (SSE3) */
   5623                DIP("fisttpl %s\n", dis_buf);
   5624                storeLE( mkexpr(addr),
   5625                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   5626                fp_pop();
   5627                break;
   5628 
   5629             case 2: /* FIST m32 */
   5630                DIP("fistl %s\n", dis_buf);
   5631                storeLE( mkexpr(addr),
   5632                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5633                break;
   5634 
   5635             case 3: /* FISTP m32 */
   5636                DIP("fistpl %s\n", dis_buf);
   5637                storeLE( mkexpr(addr),
   5638                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5639                fp_pop();
   5640                break;
   5641 
   5642             case 5: { /* FLD extended-real */
   5643                /* Uses dirty helper:
   5644                      ULong amd64g_loadF80le ( ULong )
   5645                   addr holds the address.  First, do a dirty call to
   5646                   get hold of the data. */
   5647                IRTemp   val  = newTemp(Ity_I64);
   5648                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   5649 
   5650                IRDirty* d = unsafeIRDirty_1_N (
   5651                                val,
   5652                                0/*regparms*/,
   5653                                "amd64g_dirtyhelper_loadF80le",
   5654                                &amd64g_dirtyhelper_loadF80le,
   5655                                args
   5656                             );
   5657                /* declare that we're reading memory */
   5658                d->mFx   = Ifx_Read;
   5659                d->mAddr = mkexpr(addr);
   5660                d->mSize = 10;
   5661 
   5662                /* execute the dirty call, dumping the result in val. */
   5663                stmt( IRStmt_Dirty(d) );
   5664                fp_push();
   5665                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   5666 
   5667                DIP("fldt %s\n", dis_buf);
   5668                break;
   5669             }
   5670 
   5671             case 7: { /* FSTP extended-real */
   5672                /* Uses dirty helper:
   5673                      void amd64g_storeF80le ( ULong addr, ULong data )
   5674                */
   5675                IRExpr** args
   5676                   = mkIRExprVec_2( mkexpr(addr),
   5677                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   5678 
   5679                IRDirty* d = unsafeIRDirty_0_N (
   5680                                0/*regparms*/,
   5681                                "amd64g_dirtyhelper_storeF80le",
   5682                                &amd64g_dirtyhelper_storeF80le,
   5683                                args
   5684                             );
   5685                /* declare we're writing memory */
   5686                d->mFx   = Ifx_Write;
   5687                d->mAddr = mkexpr(addr);
   5688                d->mSize = 10;
   5689 
   5690                /* execute the dirty call. */
   5691                stmt( IRStmt_Dirty(d) );
   5692                fp_pop();
   5693 
   5694                DIP("fstpt\n %s", dis_buf);
   5695                break;
   5696             }
   5697 
   5698             default:
   5699                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5700                vex_printf("first_opcode == 0xDB\n");
   5701                goto decode_fail;
   5702          }
   5703 
   5704       } else {
   5705 
   5706          delta++;
   5707          switch (modrm) {
   5708 
   5709             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   5710                r_src = (UInt)modrm - 0xC0;
   5711                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   5712                put_ST_UNCHECKED(0,
   5713                                 IRExpr_Mux0X(
   5714                                     unop(Iop_1Uto8,
   5715                                          mk_amd64g_calculate_condition(AMD64CondNB)),
   5716                                     get_ST(0), get_ST(r_src)) );
   5717                break;
   5718 
   5719             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   5720                r_src = (UInt)modrm - 0xC8;
   5721                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   5722                put_ST_UNCHECKED(
   5723                   0,
   5724                   IRExpr_Mux0X(
   5725                      unop(Iop_1Uto8,
   5726                           mk_amd64g_calculate_condition(AMD64CondNZ)),
   5727                      get_ST(0),
   5728                      get_ST(r_src)
   5729                   )
   5730                );
   5731                break;
   5732 
   5733             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   5734                r_src = (UInt)modrm - 0xD0;
   5735                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   5736                put_ST_UNCHECKED(
   5737                   0,
   5738                   IRExpr_Mux0X(
   5739                      unop(Iop_1Uto8,
   5740                           mk_amd64g_calculate_condition(AMD64CondNBE)),
   5741                      get_ST(0),
   5742                      get_ST(r_src)
   5743                   )
   5744                );
   5745                break;
   5746 
   5747             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   5748                r_src = (UInt)modrm - 0xD8;
   5749                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   5750                put_ST_UNCHECKED(
   5751                   0,
   5752                   IRExpr_Mux0X(
   5753                      unop(Iop_1Uto8,
   5754                           mk_amd64g_calculate_condition(AMD64CondNP)),
   5755                      get_ST(0),
   5756                      get_ST(r_src)
   5757                   )
   5758                );
   5759                break;
   5760 
   5761             case 0xE2:
   5762                DIP("fnclex\n");
   5763                break;
   5764 
   5765             case 0xE3: {
   5766                /* Uses dirty helper:
   5767                      void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   5768                IRDirty* d  = unsafeIRDirty_0_N (
   5769                                 0/*regparms*/,
   5770                                 "amd64g_dirtyhelper_FINIT",
   5771                                 &amd64g_dirtyhelper_FINIT,
   5772                                 mkIRExprVec_0()
   5773                              );
   5774                d->needsBBP = True;
   5775 
   5776                /* declare we're writing guest state */
   5777                d->nFxState = 5;
   5778 
   5779                d->fxState[0].fx     = Ifx_Write;
   5780                d->fxState[0].offset = OFFB_FTOP;
   5781                d->fxState[0].size   = sizeof(UInt);
   5782 
   5783                d->fxState[1].fx     = Ifx_Write;
   5784                d->fxState[1].offset = OFFB_FPREGS;
   5785                d->fxState[1].size   = 8 * sizeof(ULong);
   5786 
   5787                d->fxState[2].fx     = Ifx_Write;
   5788                d->fxState[2].offset = OFFB_FPTAGS;
   5789                d->fxState[2].size   = 8 * sizeof(UChar);
   5790 
   5791                d->fxState[3].fx     = Ifx_Write;
   5792                d->fxState[3].offset = OFFB_FPROUND;
   5793                d->fxState[3].size   = sizeof(ULong);
   5794 
   5795                d->fxState[4].fx     = Ifx_Write;
   5796                d->fxState[4].offset = OFFB_FC3210;
   5797                d->fxState[4].size   = sizeof(ULong);
   5798 
   5799                stmt( IRStmt_Dirty(d) );
   5800 
   5801                DIP("fninit\n");
   5802                break;
   5803             }
   5804 
   5805             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   5806                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   5807                break;
   5808 
   5809             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   5810                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   5811                break;
   5812 
   5813             default:
   5814                goto decode_fail;
   5815          }
   5816       }
   5817    }
   5818 
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   /* Mem forms: double-real (m64) arithmetic against ST(0), done via
      the fp_do_{op,oprev}_mem_ST_0 helpers.  Reg forms: same
      arithmetic with ST(?) as destination, via fp_do_{op,oprev}_ST_ST. */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

//..             case 2: /* FCOM double-real */
//..                DIP("fcoml %s\n", dis_buf);
//..                /* This forces C1 to zero, which isn't right. */
//..                put_C3210(
//..                    binop( Iop_And32,
//..                           binop(Iop_Shl32,
//..                                 binop(Iop_CmpF64,
//..                                       get_ST(0),
//..                                       loadLE(Ity_F64,mkexpr(addr))),
//..                                 mkU8(8)),
//..                           mkU32(0x4500)
//..                    ));
//..                break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Map IRCmpF64Result (from Iop_CmpF64) into the x87
                  C3/C2/C0 bit positions: shift left 8 and mask with
                  0x4500 (bits 14, 10, 8). */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               /* reversed operands, hence the "oprev" helper */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
   5926 
   5927    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   5928    else
   5929    if (first_opcode == 0xDD) {
   5930 
   5931       if (modrm < 0xC0) {
   5932 
   5933          /* bits 5,4,3 are an opcode extension, and the modRM also
   5934             specifies an address. */
   5935          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5936          delta += len;
   5937 
   5938          switch (gregLO3ofRM(modrm)) {
   5939 
   5940             case 0: /* FLD double-real */
   5941                DIP("fldl %s\n", dis_buf);
   5942                fp_push();
   5943                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   5944                break;
   5945 
   5946             case 1: /* FISTTPQ m64 (SSE3) */
   5947                DIP("fistppll %s\n", dis_buf);
   5948                storeLE( mkexpr(addr),
   5949                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   5950                fp_pop();
   5951                break;
   5952 
   5953             case 2: /* FST double-real */
   5954                DIP("fstl %s\n", dis_buf);
   5955                storeLE(mkexpr(addr), get_ST(0));
   5956                break;
   5957 
   5958             case 3: /* FSTP double-real */
   5959                DIP("fstpl %s\n", dis_buf);
   5960                storeLE(mkexpr(addr), get_ST(0));
   5961                fp_pop();
   5962                break;
   5963 
   5964 //..             case 4: { /* FRSTOR m108 */
   5965 //..                /* Uses dirty helper:
   5966 //..                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
   5967 //..                IRTemp   ew = newTemp(Ity_I32);
   5968 //..                IRDirty* d  = unsafeIRDirty_0_N (
   5969 //..                                 0/*regparms*/,
   5970 //..                                 "x86g_dirtyhelper_FRSTOR",
   5971 //..                                 &x86g_dirtyhelper_FRSTOR,
   5972 //..                                 mkIRExprVec_1( mkexpr(addr) )
   5973 //..                              );
   5974 //..                d->needsBBP = True;
   5975 //..                d->tmp      = ew;
   5976 //..                /* declare we're reading memory */
   5977 //..                d->mFx   = Ifx_Read;
   5978 //..                d->mAddr = mkexpr(addr);
   5979 //..                d->mSize = 108;
   5980 //..
   5981 //..                /* declare we're writing guest state */
   5982 //..                d->nFxState = 5;
   5983 //..
   5984 //..                d->fxState[0].fx     = Ifx_Write;
   5985 //..                d->fxState[0].offset = OFFB_FTOP;
   5986 //..                d->fxState[0].size   = sizeof(UInt);
   5987 //..
   5988 //..                d->fxState[1].fx     = Ifx_Write;
   5989 //..                d->fxState[1].offset = OFFB_FPREGS;
   5990 //..                d->fxState[1].size   = 8 * sizeof(ULong);
   5991 //..
   5992 //..                d->fxState[2].fx     = Ifx_Write;
   5993 //..                d->fxState[2].offset = OFFB_FPTAGS;
   5994 //..                d->fxState[2].size   = 8 * sizeof(UChar);
   5995 //..
   5996 //..                d->fxState[3].fx     = Ifx_Write;
   5997 //..                d->fxState[3].offset = OFFB_FPROUND;
   5998 //..                d->fxState[3].size   = sizeof(UInt);
   5999 //..
   6000 //..                d->fxState[4].fx     = Ifx_Write;
   6001 //..                d->fxState[4].offset = OFFB_FC3210;
   6002 //..                d->fxState[4].size   = sizeof(UInt);
   6003 //..
   6004 //..                stmt( IRStmt_Dirty(d) );
   6005 //..
   6006 //..                /* ew contains any emulation warning we may need to
   6007 //..                   issue.  If needed, side-exit to the next insn,
   6008 //..                   reporting the warning, so that Valgrind's dispatcher
   6009 //..                   sees the warning. */
   6010 //..                put_emwarn( mkexpr(ew) );
   6011 //..                stmt(
   6012 //..                   IRStmt_Exit(
   6013 //..                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6014 //..                      Ijk_EmWarn,
   6015 //..                      IRConst_U32( ((Addr32)guest_eip_bbstart)+delta)
   6016 //..                   )
   6017 //..                );
   6018 //..
   6019 //..                DIP("frstor %s\n", dis_buf);
   6020 //..                break;
   6021 //..             }
   6022 //..
   6023 //..             case 6: { /* FNSAVE m108 */
   6024 //..                /* Uses dirty helper:
   6025 //..                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
   6026 //..                IRDirty* d = unsafeIRDirty_0_N (
   6027 //..                                0/*regparms*/,
   6028 //..                                "x86g_dirtyhelper_FSAVE",
   6029 //..                                &x86g_dirtyhelper_FSAVE,
   6030 //..                                mkIRExprVec_1( mkexpr(addr) )
   6031 //..                             );
   6032 //..                d->needsBBP = True;
   6033 //..                /* declare we're writing memory */
   6034 //..                d->mFx   = Ifx_Write;
   6035 //..                d->mAddr = mkexpr(addr);
   6036 //..                d->mSize = 108;
   6037 //..
   6038 //..                /* declare we're reading guest state */
   6039 //..                d->nFxState = 5;
   6040 //..
   6041 //..                d->fxState[0].fx     = Ifx_Read;
   6042 //..                d->fxState[0].offset = OFFB_FTOP;
   6043 //..                d->fxState[0].size   = sizeof(UInt);
   6044 //..
   6045 //..                d->fxState[1].fx     = Ifx_Read;
   6046 //..                d->fxState[1].offset = OFFB_FPREGS;
   6047 //..                d->fxState[1].size   = 8 * sizeof(ULong);
   6048 //..
   6049 //..                d->fxState[2].fx     = Ifx_Read;
   6050 //..                d->fxState[2].offset = OFFB_FPTAGS;
   6051 //..                d->fxState[2].size   = 8 * sizeof(UChar);
   6052 //..
   6053 //..                d->fxState[3].fx     = Ifx_Read;
   6054 //..                d->fxState[3].offset = OFFB_FPROUND;
   6055 //..                d->fxState[3].size   = sizeof(UInt);
   6056 //..
   6057 //..                d->fxState[4].fx     = Ifx_Read;
   6058 //..                d->fxState[4].offset = OFFB_FC3210;
   6059 //..                d->fxState[4].size   = sizeof(UInt);
   6060 //..
   6061 //..                stmt( IRStmt_Dirty(d) );
   6062 //..
   6063 //..                DIP("fnsave %s\n", dis_buf);
   6064 //..                break;
   6065 //..             }
   6066 
   6067             case 7: { /* FNSTSW m16 */
   6068                IRExpr* sw = get_FPU_sw();
   6069                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6070                storeLE( mkexpr(addr), sw );
   6071                DIP("fnstsw %s\n", dis_buf);
   6072                break;
   6073             }
   6074 
   6075             default:
   6076                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6077                vex_printf("first_opcode == 0xDD\n");
   6078                goto decode_fail;
   6079          }
   6080       } else {
   6081          delta++;
   6082          switch (modrm) {
   6083 
   6084             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6085                r_dst = (UInt)modrm - 0xC0;
   6086                DIP("ffree %%st(%u)\n", r_dst);
   6087                put_ST_TAG ( r_dst, mkU8(0) );
   6088                break;
   6089 
   6090             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6091                r_dst = (UInt)modrm - 0xD0;
   6092                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6093                /* P4 manual says: "If the destination operand is a
   6094                   non-empty register, the invalid-operation exception
   6095                   is not generated.  Hence put_ST_UNCHECKED. */
   6096                put_ST_UNCHECKED(r_dst, get_ST(0));
   6097                break;
   6098 
   6099             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6100                r_dst = (UInt)modrm - 0xD8;
   6101                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6102                /* P4 manual says: "If the destination operand is a
   6103                   non-empty register, the invalid-operation exception
   6104                   is not generated.  Hence put_ST_UNCHECKED. */
   6105                put_ST_UNCHECKED(r_dst, get_ST(0));
   6106                fp_pop();
   6107                break;
   6108 
   6109             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6110                r_dst = (UInt)modrm - 0xE0;
   6111                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6112                /* This forces C1 to zero, which isn't right. */
   6113                put_C3210(
   6114                    unop(Iop_32Uto64,
   6115                    binop( Iop_And32,
   6116                           binop(Iop_Shl32,
   6117                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6118                                 mkU8(8)),
   6119                           mkU32(0x4500)
   6120                    )));
   6121                break;
   6122 
   6123             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6124                r_dst = (UInt)modrm - 0xE8;
   6125                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6126                /* This forces C1 to zero, which isn't right. */
   6127                put_C3210(
   6128                    unop(Iop_32Uto64,
   6129                    binop( Iop_And32,
   6130                           binop(Iop_Shl32,
   6131                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6132                                 mkU8(8)),
   6133                           mkU32(0x4500)
   6134                    )));
   6135                fp_pop();
   6136                break;
   6137 
   6138             default:
   6139                goto decode_fail;
   6140          }
   6141       }
   6142    }
   6143 
   6144    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6145    else
   6146    if (first_opcode == 0xDE) {
   6147 
   6148       if (modrm < 0xC0) {
   6149 
   6150          /* bits 5,4,3 are an opcode extension, and the modRM also
   6151             specifies an address. */
   6152          IROp   fop;
   6153          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6154          delta += len;
   6155 
   6156          switch (gregLO3ofRM(modrm)) {
   6157 
   6158             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6159                DIP("fiaddw %s\n", dis_buf);
   6160                fop = Iop_AddF64;
   6161                goto do_fop_m16;
   6162 
   6163             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6164                DIP("fimulw %s\n", dis_buf);
   6165                fop = Iop_MulF64;
   6166                goto do_fop_m16;
   6167 
   6168             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6169                DIP("fisubw %s\n", dis_buf);
   6170                fop = Iop_SubF64;
   6171                goto do_fop_m16;
   6172 
   6173             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6174                DIP("fisubrw %s\n", dis_buf);
   6175                fop = Iop_SubF64;
   6176                goto do_foprev_m16;
   6177 
   6178             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6179                DIP("fisubw %s\n", dis_buf);
   6180                fop = Iop_DivF64;
   6181                goto do_fop_m16;
   6182 
   6183             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6184                DIP("fidivrw %s\n", dis_buf);
   6185                fop = Iop_DivF64;
   6186                goto do_foprev_m16;
   6187 
   6188             do_fop_m16:
   6189                put_ST_UNCHECKED(0,
   6190                   triop(fop,
   6191                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6192                         get_ST(0),
   6193                         unop(Iop_I32StoF64,
   6194                              unop(Iop_16Sto32,
   6195                                   loadLE(Ity_I16, mkexpr(addr))))));
   6196                break;
   6197 
   6198             do_foprev_m16:
   6199                put_ST_UNCHECKED(0,
   6200                   triop(fop,
   6201                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6202                         unop(Iop_I32StoF64,
   6203                              unop(Iop_16Sto32,
   6204                                   loadLE(Ity_I16, mkexpr(addr)))),
   6205                         get_ST(0)));
   6206                break;
   6207 
   6208             default:
   6209                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6210                vex_printf("first_opcode == 0xDE\n");
   6211                goto decode_fail;
   6212          }
   6213 
   6214       } else {
   6215 
   6216          delta++;
   6217          switch (modrm) {
   6218 
   6219             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6220                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6221                break;
   6222 
   6223             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6224                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6225                break;
   6226 
   6227             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6228                DIP("fcompp %%st(0),%%st(1)\n");
   6229                /* This forces C1 to zero, which isn't right. */
   6230                put_C3210(
   6231                    unop(Iop_32Uto64,
   6232                    binop( Iop_And32,
   6233                           binop(Iop_Shl32,
   6234                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6235                                 mkU8(8)),
   6236                           mkU32(0x4500)
   6237                    )));
   6238                fp_pop();
   6239                fp_pop();
   6240                break;
   6241 
   6242             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6243                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6244                break;
   6245 
   6246             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6247                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6248                break;
   6249 
   6250             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6251                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6252                break;
   6253 
   6254             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6255                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6256                break;
   6257 
   6258             default:
   6259                goto decode_fail;
   6260          }
   6261 
   6262       }
   6263    }
   6264 
   6265    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6266    else
   6267    if (first_opcode == 0xDF) {
   6268 
   6269       if (modrm < 0xC0) {
   6270 
   6271          /* bits 5,4,3 are an opcode extension, and the modRM also
   6272             specifies an address. */
   6273          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6274          delta += len;
   6275 
   6276          switch (gregLO3ofRM(modrm)) {
   6277 
   6278             case 0: /* FILD m16int */
   6279                DIP("fildw %s\n", dis_buf);
   6280                fp_push();
   6281                put_ST(0, unop(Iop_I32StoF64,
   6282                               unop(Iop_16Sto32,
   6283                                    loadLE(Ity_I16, mkexpr(addr)))));
   6284                break;
   6285 
   6286             case 1: /* FISTTPS m16 (SSE3) */
   6287                DIP("fisttps %s\n", dis_buf);
   6288                storeLE( mkexpr(addr),
   6289                         x87ishly_qnarrow_32_to_16(
   6290                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6291                fp_pop();
   6292                break;
   6293 
   6294             case 2: /* FIST m16 */
   6295                DIP("fists %s\n", dis_buf);
   6296                storeLE( mkexpr(addr),
   6297                         x87ishly_qnarrow_32_to_16(
   6298                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6299                break;
   6300 
   6301             case 3: /* FISTP m16 */
   6302                DIP("fistps %s\n", dis_buf);
   6303                storeLE( mkexpr(addr),
   6304                         x87ishly_qnarrow_32_to_16(
   6305                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6306                fp_pop();
   6307                break;
   6308 
   6309             case 5: /* FILD m64 */
   6310                DIP("fildll %s\n", dis_buf);
   6311                fp_push();
   6312                put_ST(0, binop(Iop_I64StoF64,
   6313                                get_roundingmode(),
   6314                                loadLE(Ity_I64, mkexpr(addr))));
   6315                break;
   6316 
   6317             case 7: /* FISTP m64 */
   6318                DIP("fistpll %s\n", dis_buf);
   6319                storeLE( mkexpr(addr),
   6320                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6321                fp_pop();
   6322                break;
   6323 
   6324             default:
   6325                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6326                vex_printf("first_opcode == 0xDF\n");
   6327                goto decode_fail;
   6328          }
   6329 
   6330       } else {
   6331 
   6332          delta++;
   6333          switch (modrm) {
   6334 
   6335             case 0xC0: /* FFREEP %st(0) */
   6336                DIP("ffreep %%st(%d)\n", 0);
   6337                put_ST_TAG ( 0, mkU8(0) );
   6338                fp_pop();
   6339                break;
   6340 
   6341             case 0xE0: /* FNSTSW %ax */
   6342                DIP("fnstsw %%ax\n");
   6343                /* Invent a plausible-looking FPU status word value and
   6344                   dump it in %AX:
   6345                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6346                */
   6347                putIRegRAX(
   6348                   2,
   6349                   unop(Iop_32to16,
   6350                        binop(Iop_Or32,
   6351                              binop(Iop_Shl32,
   6352                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6353                                    mkU8(11)),
   6354                              binop(Iop_And32,
   6355                                    unop(Iop_64to32, get_C3210()),
   6356                                    mkU32(0x4700))
   6357                )));
   6358                break;
   6359 
   6360             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6361                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6362                break;
   6363 
   6364             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6365                /* not really right since COMIP != UCOMIP */
   6366                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6367                break;
   6368 
   6369             default:
   6370                goto decode_fail;
   6371          }
   6372       }
   6373 
   6374    }
   6375 
   6376    else
   6377       goto decode_fail;
   6378 
   6379    *decode_ok = True;
   6380    return delta;
   6381 
   6382   decode_fail:
   6383    *decode_ok = False;
   6384    return delta;
   6385 }
   6386 
   6387 
   6388 /*------------------------------------------------------------*/
   6389 /*---                                                      ---*/
   6390 /*--- MMX INSTRUCTIONS                                     ---*/
   6391 /*---                                                      ---*/
   6392 /*------------------------------------------------------------*/
   6393 
   6394 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   6395    IA32 arch manual, volume 3):
   6396 
   6397    Read from, or write to MMX register (viz, any insn except EMMS):
   6398    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   6399    * FP stack pointer set to zero
   6400 
   6401    EMMS:
   6402    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   6403    * FP stack pointer set to zero
   6404 */
   6405 
   6406 static void do_MMX_preamble ( void )
   6407 {
   6408    Int         i;
   6409    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6410    IRExpr*     zero  = mkU32(0);
   6411    IRExpr*     tag1  = mkU8(1);
   6412    put_ftop(zero);
   6413    for (i = 0; i < 8; i++)
   6414       stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
   6415 }
   6416 
   6417 static void do_EMMS_preamble ( void )
   6418 {
   6419    Int         i;
   6420    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6421    IRExpr*     zero  = mkU32(0);
   6422    IRExpr*     tag0  = mkU8(0);
   6423    put_ftop(zero);
   6424    for (i = 0; i < 8; i++)
   6425       stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
   6426 }
   6427 
   6428 
   6429 static IRExpr* getMMXReg ( UInt archreg )
   6430 {
   6431    vassert(archreg < 8);
   6432    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   6433 }
   6434 
   6435 
   6436 static void putMMXReg ( UInt archreg, IRExpr* e )
   6437 {
   6438    vassert(archreg < 8);
   6439    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   6440    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   6441 }
   6442 
   6443 
   6444 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   6445    sense that it does not first call do_MMX_preamble() -- that is the
   6446    responsibility of its caller. */
   6447 
static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                HChar*      name,
                                Bool        show_granularity )
{
   /* Decode one two-operand MMX insn of the form  op E, G  where G is
      always an MMX register and E is an MMX register or a 64-bit
      memory operand.  'opc' selects either a native IR binop or a
      clean helper call; the result is written back to G.  Returns the
      updated instruction offset 'delta'. */
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;   /* value of the G (register) operand */
   IRExpr* argE  = NULL;   /* value of the E (reg-or-mem) operand */
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;         /* complement G first (PANDN) */
   IROp    op    = Iop_INVALID;   /* IR binop, if one matches opc */
   void*   hAddr = NULL;          /* else: clean-helper address ... */
   HChar*  hName = NULL;          /* ... and its name, for the CCall */
   Bool    eLeft = False;         /* True: E is left arg, G is right */

   /* Select a clean helper instead of an IR binop for this opcode. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack take their operands in the opposite order. */
      case 0x6B: op = Iop_QNarrow32Sx2; eLeft = True; break;
      case 0x63: op = Iop_QNarrow16Sx4; eLeft = True; break;
      case 0x67: op = Iop_QNarrow16Ux4; eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch G, complementing it first if required (PANDN case). */
   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: either another MMX register or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Order the arguments as the chosen op requires. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Exactly one of (op, helper) must have been selected above. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   6594 
   6595 
   6596 /* Vector by scalar shift of G by the amount specified at the bottom
   6597    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   6598 
   6599 static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
   6600                                   Prefix pfx, Long delta,
   6601                                   HChar* opname, IROp op )
   6602 {
   6603    HChar   dis_buf[50];
   6604    Int     alen, size;
   6605    IRTemp  addr;
   6606    Bool    shl, shr, sar;
   6607    UChar   rm   = getUChar(delta);
   6608    IRTemp  g0   = newTemp(Ity_I64);
   6609    IRTemp  g1   = newTemp(Ity_I64);
   6610    IRTemp  amt  = newTemp(Ity_I64);
   6611    IRTemp  amt8 = newTemp(Ity_I8);
   6612 
   6613    if (epartIsReg(rm)) {
   6614       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   6615       DIP("%s %s,%s\n", opname,
   6616                         nameMMXReg(eregLO3ofRM(rm)),
   6617                         nameMMXReg(gregLO3ofRM(rm)) );
   6618       delta++;
   6619    } else {
   6620       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   6621       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   6622       DIP("%s %s,%s\n", opname,
   6623                         dis_buf,
   6624                         nameMMXReg(gregLO3ofRM(rm)) );
   6625       delta += alen;
   6626    }
   6627    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   6628    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   6629 
   6630    shl = shr = sar = False;
   6631    size = 0;
   6632    switch (op) {
   6633       case Iop_ShlN16x4: shl = True; size = 32; break;
   6634       case Iop_ShlN32x2: shl = True; size = 32; break;
   6635       case Iop_Shl64:    shl = True; size = 64; break;
   6636       case Iop_ShrN16x4: shr = True; size = 16; break;
   6637       case Iop_ShrN32x2: shr = True; size = 32; break;
   6638       case Iop_Shr64:    shr = True; size = 64; break;
   6639       case Iop_SarN16x4: sar = True; size = 16; break;
   6640       case Iop_SarN32x2: sar = True; size = 32; break;
   6641       default: vassert(0);
   6642    }
   6643 
   6644    if (shl || shr) {
   6645      assign(
   6646         g1,
   6647         IRExpr_Mux0X(
   6648            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6649            mkU64(0),
   6650            binop(op, mkexpr(g0), mkexpr(amt8))
   6651         )
   6652      );
   6653    } else
   6654    if (sar) {
   6655      assign(
   6656         g1,
   6657         IRExpr_Mux0X(
   6658            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6659            binop(op, mkexpr(g0), mkU8(size-1)),
   6660            binop(op, mkexpr(g0), mkexpr(amt8))
   6661         )
   6662      );
   6663    } else {
   6664       vassert(0);
   6665    }
   6666 
   6667    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   6668    return delta;
   6669 }
   6670 
   6671 
   6672 /* Vector by scalar shift of E by an immediate byte.  This is a
   6673    straight copy of dis_SSE_shiftE_imm. */
   6674 
   6675 static
   6676 ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
   6677 {
   6678    Bool    shl, shr, sar;
   6679    UChar   rm   = getUChar(delta);
   6680    IRTemp  e0   = newTemp(Ity_I64);
   6681    IRTemp  e1   = newTemp(Ity_I64);
   6682    UChar   amt, size;
   6683    vassert(epartIsReg(rm));
   6684    vassert(gregLO3ofRM(rm) == 2
   6685            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   6686    amt = getUChar(delta+1);
   6687    delta += 2;
   6688    DIP("%s $%d,%s\n", opname,
   6689                       (Int)amt,
   6690                       nameMMXReg(eregLO3ofRM(rm)) );
   6691 
   6692    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   6693 
   6694    shl = shr = sar = False;
   6695    size = 0;
   6696    switch (op) {
   6697       case Iop_ShlN16x4: shl = True; size = 16; break;
   6698       case Iop_ShlN32x2: shl = True; size = 32; break;
   6699       case Iop_Shl64:    shl = True; size = 64; break;
   6700       case Iop_SarN16x4: sar = True; size = 16; break;
   6701       case Iop_SarN32x2: sar = True; size = 32; break;
   6702       case Iop_ShrN16x4: shr = True; size = 16; break;
   6703       case Iop_ShrN32x2: shr = True; size = 32; break;
   6704       case Iop_Shr64:    shr = True; size = 64; break;
   6705       default: vassert(0);
   6706    }
   6707 
   6708    if (shl || shr) {
   6709      assign( e1, amt >= size
   6710                     ? mkU64(0)
   6711                     : binop(op, mkexpr(e0), mkU8(amt))
   6712      );
   6713    } else
   6714    if (sar) {
   6715      assign( e1, amt >= size
   6716                     ? binop(op, mkexpr(e0), mkU8(size-1))
   6717                     : binop(op, mkexpr(e0), mkU8(amt))
   6718      );
   6719    } else {
   6720       vassert(0);
   6721    }
   6722 
   6723    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   6724    return delta;
   6725 }
   6726 
   6727 
   6728 /* Completely handle all MMX instructions except emms. */
   6729 
   6730 static
   6731 ULong dis_MMX ( Bool* decode_ok,
   6732                 VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   6733 {
   6734    Int   len;
   6735    UChar modrm;
   6736    HChar dis_buf[50];
   6737    UChar opc = getUChar(delta);
   6738    delta++;
   6739 
   6740    /* dis_MMX handles all insns except emms. */
   6741    do_MMX_preamble();
   6742 
   6743    switch (opc) {
   6744 
   6745       case 0x6E:
   6746          if (sz == 4) {
   6747             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   6748             modrm = getUChar(delta);
   6749             if (epartIsReg(modrm)) {
   6750                delta++;
   6751                putMMXReg(
   6752                   gregLO3ofRM(modrm),
   6753                   binop( Iop_32HLto64,
   6754                          mkU32(0),
   6755                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   6756                DIP("movd %s, %s\n",
   6757                    nameIReg32(eregOfRexRM(pfx,modrm)),
   6758                    nameMMXReg(gregLO3ofRM(modrm)));
   6759             } else {
   6760                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6761                delta += len;
   6762                putMMXReg(
   6763                   gregLO3ofRM(modrm),
   6764                   binop( Iop_32HLto64,
   6765                          mkU32(0),
   6766                          loadLE(Ity_I32, mkexpr(addr)) ) );
   6767                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6768             }
   6769          }
   6770          else
   6771          if (sz == 8) {
   6772             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   6773             modrm = getUChar(delta);
   6774             if (epartIsReg(modrm)) {
   6775                delta++;
   6776                putMMXReg( gregLO3ofRM(modrm),
   6777                           getIReg64(eregOfRexRM(pfx,modrm)) );
   6778                DIP("movd %s, %s\n",
   6779                    nameIReg64(eregOfRexRM(pfx,modrm)),
   6780                    nameMMXReg(gregLO3ofRM(modrm)));
   6781             } else {
   6782                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6783                delta += len;
   6784                putMMXReg( gregLO3ofRM(modrm),
   6785                           loadLE(Ity_I64, mkexpr(addr)) );
   6786                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6787             }
   6788          }
   6789          else {
   6790             goto mmx_decode_failure;
   6791          }
   6792          break;
   6793 
   6794       case 0x7E:
   6795          if (sz == 4) {
   6796             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   6797             modrm = getUChar(delta);
   6798             if (epartIsReg(modrm)) {
   6799                delta++;
   6800                putIReg32( eregOfRexRM(pfx,modrm),
   6801                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   6802                DIP("movd %s, %s\n",
   6803                    nameMMXReg(gregLO3ofRM(modrm)),
   6804                    nameIReg32(eregOfRexRM(pfx,modrm)));
   6805             } else {
   6806                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6807                delta += len;
   6808                storeLE( mkexpr(addr),
   6809                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   6810                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6811             }
   6812          }
   6813          else
   6814          if (sz == 8) {
   6815             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   6816             modrm = getUChar(delta);
   6817             if (epartIsReg(modrm)) {
   6818                delta++;
   6819                putIReg64( eregOfRexRM(pfx,modrm),
   6820                           getMMXReg(gregLO3ofRM(modrm)) );
   6821                DIP("movd %s, %s\n",
   6822                    nameMMXReg(gregLO3ofRM(modrm)),
   6823                    nameIReg64(eregOfRexRM(pfx,modrm)));
   6824             } else {
   6825                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6826                delta += len;
   6827                storeLE( mkexpr(addr),
   6828                        getMMXReg(gregLO3ofRM(modrm)) );
   6829                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6830             }
   6831          } else {
   6832             goto mmx_decode_failure;
   6833          }
   6834          break;
   6835 
   6836       case 0x6F:
   6837          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   6838          if (sz != 4
   6839              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6840             goto mmx_decode_failure;
   6841          modrm = getUChar(delta);
   6842          if (epartIsReg(modrm)) {
   6843             delta++;
   6844             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   6845             DIP("movq %s, %s\n",
   6846                 nameMMXReg(eregLO3ofRM(modrm)),
   6847                 nameMMXReg(gregLO3ofRM(modrm)));
   6848          } else {
   6849             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6850             delta += len;
   6851             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   6852             DIP("movq %s, %s\n",
   6853                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6854          }
   6855          break;
   6856 
   6857       case 0x7F:
   6858          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   6859          if (sz != 4
   6860              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6861             goto mmx_decode_failure;
   6862          modrm = getUChar(delta);
   6863          if (epartIsReg(modrm)) {
   6864             /* Fall through.  The assembler doesn't appear to generate
   6865                these. */
   6866             goto mmx_decode_failure;
   6867          } else {
   6868             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6869             delta += len;
   6870             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   6871             DIP("mov(nt)q %s, %s\n",
   6872                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6873          }
   6874          break;
   6875 
   6876       case 0xFC:
   6877       case 0xFD:
   6878       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   6879          if (sz != 4)
   6880             goto mmx_decode_failure;
   6881          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   6882          break;
   6883 
   6884       case 0xEC:
   6885       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6886          if (sz != 4
   6887              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6888             goto mmx_decode_failure;
   6889          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   6890          break;
   6891 
   6892       case 0xDC:
   6893       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6894          if (sz != 4)
   6895             goto mmx_decode_failure;
   6896          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   6897          break;
   6898 
   6899       case 0xF8:
   6900       case 0xF9:
   6901       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   6902          if (sz != 4)
   6903             goto mmx_decode_failure;
   6904          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   6905          break;
   6906 
   6907       case 0xE8:
   6908       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6909          if (sz != 4)
   6910             goto mmx_decode_failure;
   6911          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   6912          break;
   6913 
   6914       case 0xD8:
   6915       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6916          if (sz != 4)
   6917             goto mmx_decode_failure;
   6918          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   6919          break;
   6920 
   6921       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   6922          if (sz != 4)
   6923             goto mmx_decode_failure;
   6924          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   6925          break;
   6926 
   6927       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   6928          if (sz != 4)
   6929             goto mmx_decode_failure;
   6930          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   6931          break;
   6932 
   6933       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   6934          vassert(sz == 4);
   6935          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   6936          break;
   6937 
   6938       case 0x74:
   6939       case 0x75:
   6940       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   6941          if (sz != 4)
   6942             goto mmx_decode_failure;
   6943          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   6944          break;
   6945 
   6946       case 0x64:
   6947       case 0x65:
   6948       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   6949          if (sz != 4)
   6950             goto mmx_decode_failure;
   6951          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   6952          break;
   6953 
   6954       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   6955          if (sz != 4)
   6956             goto mmx_decode_failure;
   6957          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   6958          break;
   6959 
   6960       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6961          if (sz != 4)
   6962             goto mmx_decode_failure;
   6963          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   6964          break;
   6965 
   6966       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6967          if (sz != 4)
   6968             goto mmx_decode_failure;
   6969          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   6970          break;
   6971 
   6972       case 0x68:
   6973       case 0x69:
   6974       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   6975          if (sz != 4
   6976              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6977             goto mmx_decode_failure;
   6978          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   6979          break;
   6980 
   6981       case 0x60:
   6982       case 0x61:
   6983       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6984          if (sz != 4
   6985              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6986             goto mmx_decode_failure;
   6987          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   6988          break;
   6989 
   6990       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   6991          if (sz != 4)
   6992             goto mmx_decode_failure;
   6993          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   6994          break;
   6995 
   6996       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   6997          if (sz != 4)
   6998             goto mmx_decode_failure;
   6999          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7000          break;
   7001 
   7002       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7003          if (sz != 4)
   7004             goto mmx_decode_failure;
   7005          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7006          break;
   7007 
   7008       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7009          if (sz != 4)
   7010             goto mmx_decode_failure;
   7011          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7012          break;
   7013 
   7014 #     define SHIFT_BY_REG(_name,_op)                                     \
   7015                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7016                 break;
   7017 
   7018       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7019       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7020       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7021       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7022 
   7023       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7024       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7025       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7026       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7027 
   7028       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7029       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7030       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7031 
   7032 #     undef SHIFT_BY_REG
   7033 
   7034       case 0x71:
   7035       case 0x72:
   7036       case 0x73: {
   7037          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7038          UChar byte2, subopc;
   7039          if (sz != 4)
   7040             goto mmx_decode_failure;
   7041          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7042          subopc = toUChar( (byte2 >> 3) & 7 );
   7043 
   7044 #        define SHIFT_BY_IMM(_name,_op)                        \
   7045             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7046             } while (0)
   7047 
   7048               if (subopc == 2 /*SRL*/ && opc == 0x71)
   7049                   SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7050          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7051                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7052          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7053                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7054 
   7055          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7056                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7057          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7058                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7059 
   7060          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7061                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7062          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   7063                   SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7064          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7065                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7066 
   7067          else goto mmx_decode_failure;
   7068 
   7069 #        undef SHIFT_BY_IMM
   7070          break;
   7071       }
   7072 
   7073       case 0xF7: {
   7074          IRTemp addr    = newTemp(Ity_I64);
   7075          IRTemp regD    = newTemp(Ity_I64);
   7076          IRTemp regM    = newTemp(Ity_I64);
   7077          IRTemp mask    = newTemp(Ity_I64);
   7078          IRTemp olddata = newTemp(Ity_I64);
   7079          IRTemp newdata = newTemp(Ity_I64);
   7080 
   7081          modrm = getUChar(delta);
   7082          if (sz != 4 || (!epartIsReg(modrm)))
   7083             goto mmx_decode_failure;
   7084          delta++;
   7085 
   7086          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7087          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7088          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7089          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   7090          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7091          assign( newdata,
   7092                  binop(Iop_Or64,
   7093                        binop(Iop_And64,
   7094                              mkexpr(regD),
   7095                              mkexpr(mask) ),
   7096                        binop(Iop_And64,
   7097                              mkexpr(olddata),
   7098                              unop(Iop_Not64, mkexpr(mask)))) );
   7099          storeLE( mkexpr(addr), mkexpr(newdata) );
   7100          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7101                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7102          break;
   7103       }
   7104 
   7105       /* --- MMX decode failure --- */
   7106       default:
   7107       mmx_decode_failure:
   7108          *decode_ok = False;
   7109          return delta; /* ignored */
   7110 
   7111    }
   7112 
   7113    *decode_ok = True;
   7114    return delta;
   7115 }
   7116 
   7117 
   7118 /*------------------------------------------------------------*/
   7119 /*--- More misc arithmetic and other obscure insns.        ---*/
   7120 /*------------------------------------------------------------*/
   7121 
   7122 /* Generate base << amt with vacated places filled with stuff
   7123    from xtra.  amt guaranteed in 0 .. 63. */
   7124 static
   7125 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7126 {
   7127    /* if   amt == 0
   7128       then base
   7129       else (base << amt) | (xtra >>u (64-amt))
   7130    */
   7131    return
   7132       IRExpr_Mux0X(
   7133          mkexpr(amt),
   7134          mkexpr(base),
   7135          binop(Iop_Or64,
   7136                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7137                binop(Iop_Shr64, mkexpr(xtra),
   7138                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7139          )
   7140       );
   7141 }
   7142 
   7143 /* Generate base >>u amt with vacated places filled with stuff
   7144    from xtra.  amt guaranteed in 0 .. 63. */
   7145 static
   7146 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7147 {
   7148    /* if   amt == 0
   7149       then base
   7150       else (base >>u amt) | (xtra << (64-amt))
   7151    */
   7152    return
   7153       IRExpr_Mux0X(
   7154          mkexpr(amt),
   7155          mkexpr(base),
   7156          binop(Iop_Or64,
   7157                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7158                binop(Iop_Shl64, mkexpr(xtra),
   7159                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7160          )
   7161       );
   7162 }
   7163 
   7164 /* Double length left and right shifts.  Apparently only required in
   7165    v-size (no b- variant). */
/* Disassemble SHLD/SHRD (double-length shift) with G as the bit
   source and E as the destination.  shift_amt may be either an
   immediate or CL; amt_is_literal says which, and affects both
   amode decoding (the imm8 follows the amode) and the final delta
   adjustment.  Returns the updated instruction offset. */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);       /* G operand: supplies shifted-in bits */
   IRTemp esrc   = newTemp(ty);       /* E operand: the value being shifted */
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);   /* masked shift amount */
   IRTemp tmpSS  = newTemp(Ity_I8);   /* masked (shift amount - 1), for flags */
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* Hardware masks the shift count to 6 bits for 64-bit operands,
      5 bits otherwise. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64).  The subshift (by tmpSH-1) is needed so the
      flags thunk can reconstruct the last bit shifted out. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         /* Shift [esrc'gsrc] left, then recover the high 32 bits. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         /* Shift [gsrc'esrc] right; result is in the low 32 bits. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         /* Replicate gsrc below esrc so shifts up to 31 still pull in
            meaningful bits. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         /* Mirror image of the 16-bit left-shift case. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* 64-bit case: no room to build a double-length value, so use
         the two-shift helpers instead. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* For the immediate form, account for the imm8 byte itself. */
   if (amt_is_literal) delta++;
   return delta;
}
   7332 
   7333 
   7334 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7335    required. */
   7336 
   7337 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7338 
   7339 static HChar* nameBtOp ( BtOp op )
   7340 {
   7341    switch (op) {
   7342       case BtOpNone:  return "";
   7343       case BtOpSet:   return "s";
   7344       case BtOpReset: return "r";
   7345       case BtOpComp:  return "c";
   7346       default: vpanic("nameBtOp(amd64)");
   7347    }
   7348 }
   7349 
   7350 
/* Disassemble BT/BTS/BTR/BTC with a register bit index (G) and a
   register-or-memory target (E).  op selects which of the four
   instructions this is.  Sets C from the selected bit; forces
   O/S/Z/A/P to zero.  Returns the updated instruction offset. */
static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
     t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);   /* byte read from the target address */
   t_new     = newTemp(Ity_I8);   /* modified byte (BTS/BTR/BTC only) */
   t_bitno0  = newTemp(Ity_I64);  /* raw bit number from G */
   t_bitno1  = newTemp(Ity_I64);  /* bit number, masked if E is a reg */
   t_bitno2  = newTemp(Ity_I8);   /* bit offset within the target byte */
   t_addr1   = newTemp(Ity_I64);  /* address of the byte holding the bit */
   modrm     = getUChar(delta);

   /* The bit index is signed-widened: for the memory form it may
      address bits before the base address. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      /* Memory form: the full (unmasked) bit number is used. */
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Arithmetic (signed) shift: negative bit numbers address bytes
      below t_addr0. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* LOCK prefix is only honoured for the memory form; use a CAS
         so the read-modify-write is atomic. */
      if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   7505 
   7506 
   7507 
   7508 /* Handle BSF/BSR.  Only v-size seems necessary. */
/* Disassemble BSF (fwds=True) or BSR (fwds=False), E to G.  Z is set
   iff the source is zero; other flags are forced to zero.  When the
   source is zero the destination register is left unchanged, which
   matches observed AMD behaviour (architecturally the result is
   undefined in that case).  Returns the updated offset. */
static
ULong dis_bs_E_G ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);       /* E operand at its natural width */
   IRTemp dst   = newTemp(ty);       /* result at natural width */
   IRTemp src64 = newTemp(Ity_I64);  /* source zero-widened to 64 bits */
   IRTemp dst64 = newTemp(Ity_I64);  /* 64-bit result before narrowing */
   IRTemp src8  = newTemp(Ity_I8);   /* nonzero iff source is nonzero */

   vassert(sz == 8 || sz == 4 || sz == 2);

   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate an 8-bit expression which is zero iff the
      original is zero, and nonzero otherwise */
   assign( src8,
           unop(Iop_1Uto8,
                binop(Iop_CmpNE64,
                      mkexpr(src64), mkU64(0))) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_Mux0X( mkexpr(src8),
                          /* src==0 */
                          mkU64(AMD64G_CC_MASK_Z),
                          /* src!=0 */
                          mkU64(0)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero. */
   assign( dst64,
           IRExpr_Mux0X(
              mkexpr(src8),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) ),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64)))
           )
         );

   /* Narrow the 64-bit result back to the operand width. */
   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   7623 
   7624 
   7625 /* swap rAX with the reg specified by reg and REX.B */
   7626 static
   7627 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   7628 {
   7629    IRType ty = szToITy(sz);
   7630    IRTemp t1 = newTemp(ty);
   7631    IRTemp t2 = newTemp(ty);
   7632    vassert(sz == 4 || sz == 8);
   7633    vassert(regLo3 < 8);
   7634    if (sz == 8) {
   7635       assign( t1, getIReg64(R_RAX) );
   7636       assign( t2, getIRegRexB(8, pfx, regLo3) );
   7637       putIReg64( R_RAX, mkexpr(t2) );
   7638       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   7639    } else {
   7640       assign( t1, getIReg32(R_RAX) );
   7641       assign( t2, getIRegRexB(4, pfx, regLo3) );
   7642       putIReg32( R_RAX, mkexpr(t2) );
   7643       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   7644    }
   7645    DIP("xchg%c %s, %s\n",
   7646        nameISize(sz), nameIRegRAX(sz),
   7647                       nameIRegRexB(sz,pfx, regLo3));
   7648 }
   7649 
   7650 
   7651 static
   7652 void codegen_SAHF ( void )
   7653 {
   7654    /* Set the flags to:
   7655       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   7656                                     -- retain the old O flag
   7657       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7658                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   7659    */
   7660    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7661                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7662    IRTemp oldflags   = newTemp(Ity_I64);
   7663    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   7664    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   7665    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   7666    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   7667    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7668          binop(Iop_Or64,
   7669                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   7670                binop(Iop_And64,
   7671                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   7672                      mkU64(mask_SZACP))
   7673               )
   7674    ));
   7675 }
   7676 
   7677 
   7678 static
   7679 void codegen_LAHF ( void  )
   7680 {
   7681    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   7682    IRExpr* rax_with_hole;
   7683    IRExpr* new_byte;
   7684    IRExpr* new_rax;
   7685    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7686                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7687 
   7688    IRTemp  flags = newTemp(Ity_I64);
   7689    assign( flags, mk_amd64g_calculate_rflags_all() );
   7690 
   7691    rax_with_hole
   7692       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   7693    new_byte
   7694       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   7695                         mkU64(1<<1));
   7696    new_rax
   7697       = binop(Iop_Or64, rax_with_hole,
   7698                         binop(Iop_Shl64, new_byte, mkU8(8)));
   7699    putIReg64(R_RAX, new_rax);
   7700 }
   7701 
   7702 
/* Disassemble CMPXCHG G,E of the given operand size: compare the
   accumulator (AL/AX/EAX/RAX) with E; if equal, ZF is set and G is
   written to E, otherwise ZF is clear and E is loaded into the
   accumulator.  Rflags are set as for SUB(acc, E) in all cases.
   Returns the updated delta; sets *ok to True on every path that
   reaches the end. */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);      /* accumulator (xAX) value */
   IRTemp src   = newTemp(ty);      /* G: candidate new value for E */
   IRTemp dest  = newTemp(ty);      /* old value of E */
   IRTemp dest2 = newTemp(ty);      /* value written back to E */
   IRTemp acc2  = newTemp(ty);      /* value written back to xAX */
   IRTemp cond8 = newTemp(Ity_I8);  /* 1 iff acc == old E (ZF) */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on Mux0X

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on Mux0X

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags reflect the comparison acc - dest. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      /* On equality: E := G, and xAX is effectively unchanged
         (acc == dest).  Otherwise: E unchanged, xAX := old E. */
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      /* Non-atomic write-back: the store happens unconditionally,
         writing back either the old value or G. */
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Atomic compare-and-swap; dest receives the old memory value
         whether or not the swap succeeded. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   7792 
   7793 
   7794 /* Handle conditional move instructions of the form
   7795       cmovcc E(reg-or-mem), G(reg)
   7796 
   7797    E(src) is reg-or-mem
   7798    G(dst) is reg.
   7799 
   7800    If E is reg, -->    GET %E, tmps
   7801                        GET %G, tmpd
   7802                        CMOVcc tmps, tmpd
   7803                        PUT tmpd, %G
   7804 
   7805    If E is mem  -->    (getAddr E) -> tmpa
   7806                        LD (tmpa), tmps
   7807                        GET %G, tmpd
   7808                        CMOVcc tmps, tmpd
   7809                        PUT tmpd, %G
   7810 */
   7811 static
   7812 ULong dis_cmov_E_G ( VexAbiInfo* vbi,
   7813                      Prefix        pfx,
   7814                      Int           sz,
   7815                      AMD64Condcode cond,
   7816                      Long          delta0 )
   7817 {
   7818    UChar rm  = getUChar(delta0);
   7819    HChar dis_buf[50];
   7820    Int   len;
   7821 
   7822    IRType ty   = szToITy(sz);
   7823    IRTemp tmps = newTemp(ty);
   7824    IRTemp tmpd = newTemp(ty);
   7825 
   7826    if (epartIsReg(rm)) {
   7827       assign( tmps, getIRegE(sz, pfx, rm) );
   7828       assign( tmpd, getIRegG(sz, pfx, rm) );
   7829 
   7830       putIRegG( sz, pfx, rm,
   7831                 IRExpr_Mux0X( unop(Iop_1Uto8,
   7832                                    mk_amd64g_calculate_condition(cond)),
   7833                               mkexpr(tmpd),
   7834                               mkexpr(tmps) )
   7835               );
   7836       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   7837                             nameIRegE(sz,pfx,rm),
   7838                             nameIRegG(sz,pfx,rm));
   7839       return 1+delta0;
   7840    }
   7841 
   7842    /* E refers to memory */
   7843    {
   7844       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   7845       assign( tmps, loadLE(ty, mkexpr(addr)) );
   7846       assign( tmpd, getIRegG(sz, pfx, rm) );
   7847 
   7848       putIRegG( sz, pfx, rm,
   7849                 IRExpr_Mux0X( unop(Iop_1Uto8,
   7850                                    mk_amd64g_calculate_condition(cond)),
   7851                               mkexpr(tmpd),
   7852                               mkexpr(tmps) )
   7853               );
   7854 
   7855       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   7856                             dis_buf,
   7857                             nameIRegG(sz,pfx,rm));
   7858       return len+delta0;
   7859    }
   7860 }
   7861 
   7862 
/* Disassemble XADD G,E of the given operand size: E and G are
   exchanged, then their sum is stored in E.  Rflags are set as for
   the addition.  Returns the updated delta; *decode_ok is set True
   on every handled path. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old value of E */
   IRTemp tmpt0 = newTemp(ty);   /* old value of G */
   IRTemp tmpt1 = newTemp(ty);   /* the sum, i.e. new value for E */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* G receives the old E; E receives the sum. */
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm),
                         nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* Atomically replace the old value with the sum; presumably
         casLE arranges a restart at guest_RIP_curr_instr if the
         location changed concurrently -- see casLE. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
   7937 
   7938 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   7939 //..
   7940 //.. static
   7941 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   7942 //.. {
   7943 //..    Int    len;
   7944 //..    IRTemp addr;
   7945 //..    UChar  rm  = getUChar(delta0);
   7946 //..    HChar  dis_buf[50];
   7947 //..
   7948 //..    if (epartIsReg(rm)) {
   7949 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   7950 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   7951 //..       return 1+delta0;
   7952 //..    } else {
   7953 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   7954 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   7955 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   7956 //..       return len+delta0;
   7957 //..    }
   7958 //.. }
   7959 //..
   7960 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   7961 //..    dst is ireg and sz==4, zero out top half of it.  */
   7962 //..
   7963 //.. static
   7964 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   7965 //..                      Int   sz,
   7966 //..                      UInt  delta0 )
   7967 //.. {
   7968 //..    Int    len;
   7969 //..    IRTemp addr;
   7970 //..    UChar  rm  = getUChar(delta0);
   7971 //..    HChar  dis_buf[50];
   7972 //..
   7973 //..    vassert(sz == 2 || sz == 4);
   7974 //..
   7975 //..    if (epartIsReg(rm)) {
   7976 //..       if (sz == 4)
   7977 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   7978 //..       else
   7979 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   7980 //..
   7981 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   7982 //..       return 1+delta0;
   7983 //..    } else {
   7984 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   7985 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   7986 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   7987 //..       return len+delta0;
   7988 //..    }
   7989 //.. }
   7990 //..
   7991 //..
   7992 //.. static
   7993 //.. void dis_push_segreg ( UInt sreg, Int sz )
   7994 //.. {
   7995 //..     IRTemp t1 = newTemp(Ity_I16);
   7996 //..     IRTemp ta = newTemp(Ity_I32);
   7997 //..     vassert(sz == 2 || sz == 4);
   7998 //..
   7999 //..     assign( t1, getSReg(sreg) );
   8000 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8001 //..     putIReg(4, R_ESP, mkexpr(ta));
   8002 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8003 //..
   8004 //..     DIP("pushw %s\n", nameSReg(sreg));
   8005 //.. }
   8006 //..
   8007 //.. static
   8008 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8009 //.. {
   8010 //..     IRTemp t1 = newTemp(Ity_I16);
   8011 //..     IRTemp ta = newTemp(Ity_I32);
   8012 //..     vassert(sz == 2 || sz == 4);
   8013 //..
   8014 //..     assign( ta, getIReg(4, R_ESP) );
   8015 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8016 //..
   8017 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8018 //..     putSReg( sreg, mkexpr(t1) );
   8019 //..     DIP("pop %s\n", nameSReg(sreg));
   8020 //.. }
   8021 
   8022 static
   8023 void dis_ret ( VexAbiInfo* vbi, ULong d64 )
   8024 {
   8025    IRTemp t1 = newTemp(Ity_I64);
   8026    IRTemp t2 = newTemp(Ity_I64);
   8027    IRTemp t3 = newTemp(Ity_I64);
   8028    assign(t1, getIReg64(R_RSP));
   8029    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8030    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8031    putIReg64(R_RSP, mkexpr(t3));
   8032    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8033    jmp_treg(Ijk_Ret,t2);
   8034 }
   8035 
   8036 
   8037 /*------------------------------------------------------------*/
   8038 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8039 /*------------------------------------------------------------*/
   8040 
   8041 /* Worker function; do not call directly.
   8042    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8043 */
   8044 
   8045 static ULong dis_SSE_E_to_G_all_wrk (
   8046                 VexAbiInfo* vbi,
   8047                 Prefix pfx, Long delta,
   8048                 HChar* opname, IROp op,
   8049                 Bool   invertG
   8050              )
   8051 {
   8052    HChar   dis_buf[50];
   8053    Int     alen;
   8054    IRTemp  addr;
   8055    UChar   rm = getUChar(delta);
   8056    IRExpr* gpart
   8057       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8058                 : getXMMReg(gregOfRexRM(pfx,rm));
   8059    if (epartIsReg(rm)) {
   8060       putXMMReg( gregOfRexRM(pfx,rm),
   8061                  binop(op, gpart,
   8062                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8063       DIP("%s %s,%s\n", opname,
   8064                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8065                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8066       return delta+1;
   8067    } else {
   8068       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8069       putXMMReg( gregOfRexRM(pfx,rm),
   8070                  binop(op, gpart,
   8071                            loadLE(Ity_V128, mkexpr(addr))) );
   8072       DIP("%s %s,%s\n", opname,
   8073                         dis_buf,
   8074                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8075       return delta+alen;
   8076    }
   8077 }
   8078 
   8079 
   8080 /* All lanes SSE binary operation, G = G `op` E. */
   8081 
   8082 static
   8083 ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
   8084                            Prefix pfx, Long delta,
   8085                            HChar* opname, IROp op )
   8086 {
   8087    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8088 }
   8089 
   8090 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8091 
   8092 static
   8093 ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
   8094                                 Prefix pfx, Long delta,
   8095                                 HChar* opname, IROp op )
   8096 {
   8097    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8098 }
   8099 
   8100 
   8101 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8102 
   8103 static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
   8104                                    Prefix pfx, Long delta,
   8105                                    HChar* opname, IROp op )
   8106 {
   8107    HChar   dis_buf[50];
   8108    Int     alen;
   8109    IRTemp  addr;
   8110    UChar   rm = getUChar(delta);
   8111    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8112    if (epartIsReg(rm)) {
   8113       putXMMReg( gregOfRexRM(pfx,rm),
   8114                  binop(op, gpart,
   8115                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8116       DIP("%s %s,%s\n", opname,
   8117                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8118                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8119       return delta+1;
   8120    } else {
   8121       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8122          E operand needs to be made simply of zeroes. */
   8123       IRTemp epart = newTemp(Ity_V128);
   8124       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8125       assign( epart, unop( Iop_32UtoV128,
   8126                            loadLE(Ity_I32, mkexpr(addr))) );
   8127       putXMMReg( gregOfRexRM(pfx,rm),
   8128                  binop(op, gpart, mkexpr(epart)) );
   8129       DIP("%s %s,%s\n", opname,
   8130                         dis_buf,
   8131                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8132       return delta+alen;
   8133    }
   8134 }
   8135 
   8136 
   8137 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8138 
   8139 static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
   8140                                    Prefix pfx, Long delta,
   8141                                    HChar* opname, IROp op )
   8142 {
   8143    HChar   dis_buf[50];
   8144    Int     alen;
   8145    IRTemp  addr;
   8146    UChar   rm = getUChar(delta);
   8147    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8148    if (epartIsReg(rm)) {
   8149       putXMMReg( gregOfRexRM(pfx,rm),
   8150                  binop(op, gpart,
   8151                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8152       DIP("%s %s,%s\n", opname,
   8153                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8154                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8155       return delta+1;
   8156    } else {
   8157       /* We can only do a 64-bit memory read, so the upper half of the
   8158          E operand needs to be made simply of zeroes. */
   8159       IRTemp epart = newTemp(Ity_V128);
   8160       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8161       assign( epart, unop( Iop_64UtoV128,
   8162                            loadLE(Ity_I64, mkexpr(addr))) );
   8163       putXMMReg( gregOfRexRM(pfx,rm),
   8164                  binop(op, gpart, mkexpr(epart)) );
   8165       DIP("%s %s,%s\n", opname,
   8166                         dis_buf,
   8167                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8168       return delta+alen;
   8169    }
   8170 }
   8171 
   8172 
   8173 /* All lanes unary SSE operation, G = op(E). */
   8174 
   8175 static ULong dis_SSE_E_to_G_unary_all (
   8176                 VexAbiInfo* vbi,
   8177                 Prefix pfx, Long delta,
   8178                 HChar* opname, IROp op
   8179              )
   8180 {
   8181    HChar   dis_buf[50];
   8182    Int     alen;
   8183    IRTemp  addr;
   8184    UChar   rm = getUChar(delta);
   8185    if (epartIsReg(rm)) {
   8186       putXMMReg( gregOfRexRM(pfx,rm),
   8187                  unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
   8188       DIP("%s %s,%s\n", opname,
   8189                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8190                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8191       return delta+1;
   8192    } else {
   8193       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8194       putXMMReg( gregOfRexRM(pfx,rm),
   8195                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   8196       DIP("%s %s,%s\n", opname,
   8197                         dis_buf,
   8198                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8199       return delta+alen;
   8200    }
   8201 }
   8202 
   8203 
   8204 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8205 
   8206 static ULong dis_SSE_E_to_G_unary_lo32 (
   8207                 VexAbiInfo* vbi,
   8208                 Prefix pfx, Long delta,
   8209                 HChar* opname, IROp op
   8210              )
   8211 {
   8212    /* First we need to get the old G value and patch the low 32 bits
   8213       of the E operand into it.  Then apply op and write back to G. */
   8214    HChar   dis_buf[50];
   8215    Int     alen;
   8216    IRTemp  addr;
   8217    UChar   rm = getUChar(delta);
   8218    IRTemp  oldG0 = newTemp(Ity_V128);
   8219    IRTemp  oldG1 = newTemp(Ity_V128);
   8220 
   8221    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8222 
   8223    if (epartIsReg(rm)) {
   8224       assign( oldG1,
   8225               binop( Iop_SetV128lo32,
   8226                      mkexpr(oldG0),
   8227                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8228       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8229       DIP("%s %s,%s\n", opname,
   8230                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8231                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8232       return delta+1;
   8233    } else {
   8234       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8235       assign( oldG1,
   8236               binop( Iop_SetV128lo32,
   8237                      mkexpr(oldG0),
   8238                      loadLE(Ity_I32, mkexpr(addr)) ));
   8239       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8240       DIP("%s %s,%s\n", opname,
   8241                         dis_buf,
   8242                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8243       return delta+alen;
   8244    }
   8245 }
   8246 
   8247 
   8248 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   8249 
   8250 static ULong dis_SSE_E_to_G_unary_lo64 (
   8251                 VexAbiInfo* vbi,
   8252                 Prefix pfx, Long delta,
   8253                 HChar* opname, IROp op
   8254              )
   8255 {
   8256    /* First we need to get the old G value and patch the low 64 bits
   8257       of the E operand into it.  Then apply op and write back to G. */
   8258    HChar   dis_buf[50];
   8259    Int     alen;
   8260    IRTemp  addr;
   8261    UChar   rm = getUChar(delta);
   8262    IRTemp  oldG0 = newTemp(Ity_V128);
   8263    IRTemp  oldG1 = newTemp(Ity_V128);
   8264 
   8265    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8266 
   8267    if (epartIsReg(rm)) {
   8268       assign( oldG1,
   8269               binop( Iop_SetV128lo64,
   8270                      mkexpr(oldG0),
   8271                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   8272       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8273       DIP("%s %s,%s\n", opname,
   8274                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8275                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8276       return delta+1;
   8277    } else {
   8278       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8279       assign( oldG1,
   8280               binop( Iop_SetV128lo64,
   8281                      mkexpr(oldG0),
   8282                      loadLE(Ity_I64, mkexpr(addr)) ));
   8283       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8284       DIP("%s %s,%s\n", opname,
   8285                         dis_buf,
   8286                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8287       return delta+alen;
   8288    }
   8289 }
   8290 
   8291 
   8292 /* SSE integer binary operation:
   8293       G = G `op` E   (eLeft == False)
   8294       G = E `op` G   (eLeft == True)
   8295 */
   8296 static ULong dis_SSEint_E_to_G(
   8297                 VexAbiInfo* vbi,
   8298                 Prefix pfx, Long delta,
   8299                 HChar* opname, IROp op,
   8300                 Bool   eLeft
   8301              )
   8302 {
   8303    HChar   dis_buf[50];
   8304    Int     alen;
   8305    IRTemp  addr;
   8306    UChar   rm = getUChar(delta);
   8307    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8308    IRExpr* epart = NULL;
   8309    if (epartIsReg(rm)) {
   8310       epart = getXMMReg(eregOfRexRM(pfx,rm));
   8311       DIP("%s %s,%s\n", opname,
   8312                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8313                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8314       delta += 1;
   8315    } else {
   8316       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8317       epart = loadLE(Ity_V128, mkexpr(addr));
   8318       DIP("%s %s,%s\n", opname,
   8319                         dis_buf,
   8320                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8321       delta += alen;
   8322    }
   8323    putXMMReg( gregOfRexRM(pfx,rm),
   8324               eLeft ? binop(op, epart, gpart)
   8325 	            : binop(op, gpart, epart) );
   8326    return delta;
   8327 }
   8328 
   8329 
   8330 /* Helper for doing SSE FP comparisons. */
   8331 
   8332 static void findSSECmpOp ( Bool* needNot, IROp* op,
   8333                            Int imm8, Bool all_lanes, Int sz )
   8334 {
   8335    imm8 &= 7;
   8336    *needNot = False;
   8337    *op      = Iop_INVALID;
   8338    if (imm8 >= 4) {
   8339       *needNot = True;
   8340       imm8 -= 4;
   8341    }
   8342 
   8343    if (sz == 4 && all_lanes) {
   8344       switch (imm8) {
   8345          case 0: *op = Iop_CmpEQ32Fx4; return;
   8346          case 1: *op = Iop_CmpLT32Fx4; return;
   8347          case 2: *op = Iop_CmpLE32Fx4; return;
   8348          case 3: *op = Iop_CmpUN32Fx4; return;
   8349          default: break;
   8350       }
   8351    }
   8352    if (sz == 4 && !all_lanes) {
   8353       switch (imm8) {
   8354          case 0: *op = Iop_CmpEQ32F0x4; return;
   8355          case 1: *op = Iop_CmpLT32F0x4; return;
   8356          case 2: *op = Iop_CmpLE32F0x4; return;
   8357          case 3: *op = Iop_CmpUN32F0x4; return;
   8358          default: break;
   8359       }
   8360    }
   8361    if (sz == 8 && all_lanes) {
   8362       switch (imm8) {
   8363          case 0: *op = Iop_CmpEQ64Fx2; return;
   8364          case 1: *op = Iop_CmpLT64Fx2; return;
   8365          case 2: *op = Iop_CmpLE64Fx2; return;
   8366          case 3: *op = Iop_CmpUN64Fx2; return;
   8367          default: break;
   8368       }
   8369    }
   8370    if (sz == 8 && !all_lanes) {
   8371       switch (imm8) {
   8372          case 0: *op = Iop_CmpEQ64F0x2; return;
   8373          case 1: *op = Iop_CmpLT64F0x2; return;
   8374          case 2: *op = Iop_CmpLE64F0x2; return;
   8375          case 3: *op = Iop_CmpUN64F0x2; return;
   8376          default: break;
   8377       }
   8378    }
   8379    vpanic("findSSECmpOp(amd64,guest)");
   8380 }
   8381 
/* Handles SSE 32F/64F comparisons (CMPPS/CMPSS/CMPPD/CMPSD style).
   A trailing imm8 selects the predicate; predicates 4..7 are the
   negations of 0..3 (see findSSECmpOp).  sz is the lane size in
   bytes (4 or 8); all_lanes selects packed vs scalar forms. */

static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 HChar* opname, Bool all_lanes, Int sz )
{
   HChar   dis_buf[50];
   Int     alen, imm8;
   IRTemp  addr;
   Bool    needNot = False;
   IROp    op      = Iop_INVALID;
   IRTemp  plain   = newTemp(Ity_V128);  /* un-negated comparison result */
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* NOTE(review): final arg 1 to disAMode appears to account for
         the 1-byte immediate following the amode -- confirm against
         disAMode's definition. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      /* For the scalar forms, only 8 or 4 bytes may be read from
         memory, so the load is zero-extended into a V128. */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   if (needNot && all_lanes) {
      /* Negated packed predicate: complement every lane. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (needNot && !all_lanes) {
      /* Negated scalar predicate: complement only the low lane
         (low 4 or 8 bytes), leaving the rest of G as computed. */
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   8443 
   8444 
   8445 /* Vector by scalar shift of G by the amount specified at the bottom
   8446    of E. */
   8447 
   8448 static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
   8449                                   Prefix pfx, Long delta,
   8450                                   HChar* opname, IROp op )
   8451 {
   8452    HChar   dis_buf[50];
   8453    Int     alen, size;
   8454    IRTemp  addr;
   8455    Bool    shl, shr, sar;
   8456    UChar   rm   = getUChar(delta);
   8457    IRTemp  g0   = newTemp(Ity_V128);
   8458    IRTemp  g1   = newTemp(Ity_V128);
   8459    IRTemp  amt  = newTemp(Ity_I32);
   8460    IRTemp  amt8 = newTemp(Ity_I8);
   8461    if (epartIsReg(rm)) {
   8462       assign( amt, getXMMRegLane32(eregOfRexRM(pfx,rm), 0) );
   8463       DIP("%s %s,%s\n", opname,
   8464                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8465                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8466       delta++;
   8467    } else {
   8468       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8469       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   8470       DIP("%s %s,%s\n", opname,
   8471                         dis_buf,
   8472                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8473       delta += alen;
   8474    }
   8475    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   8476    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   8477 
   8478    shl = shr = sar = False;
   8479    size = 0;
   8480    switch (op) {
   8481       case Iop_ShlN16x8: shl = True; size = 32; break;
   8482       case Iop_ShlN32x4: shl = True; size = 32; break;
   8483       case Iop_ShlN64x2: shl = True; size = 64; break;
   8484       case Iop_SarN16x8: sar = True; size = 16; break;
   8485       case Iop_SarN32x4: sar = True; size = 32; break;
   8486       case Iop_ShrN16x8: shr = True; size = 16; break;
   8487       case Iop_ShrN32x4: shr = True; size = 32; break;
   8488       case Iop_ShrN64x2: shr = True; size = 64; break;
   8489       default: vassert(0);
   8490    }
   8491 
   8492    if (shl || shr) {
   8493      assign(
   8494         g1,
   8495         IRExpr_Mux0X(
   8496            unop(Iop_1Uto8,
   8497                 binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
   8498            mkV128(0x0000),
   8499            binop(op, mkexpr(g0), mkexpr(amt8))
   8500         )
   8501      );
   8502    } else
   8503    if (sar) {
   8504      assign(
   8505         g1,
   8506         IRExpr_Mux0X(
   8507            unop(Iop_1Uto8,
   8508                 binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
   8509            binop(op, mkexpr(g0), mkU8(size-1)),
   8510            binop(op, mkexpr(g0), mkexpr(amt8))
   8511         )
   8512      );
   8513    } else {
   8514       vassert(0);
   8515    }
   8516 
   8517    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   8518    return delta;
   8519 }
   8520 
   8521 
   8522 /* Vector by scalar shift of E by an immediate byte. */
   8523 
static
ULong dis_SSE_shiftE_imm ( Prefix pfx,
                           Long delta, HChar* opname, IROp op )
{
   /* E must be a register here.  The reg field of the modRM byte
      (2, 4 or 6) matches the PSRLx / PSRAx / PSLLx group encodings
      respectively -- presumably validated upstream; the actual
      operation is carried entirely by 'op'. */
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);   /* E before the shift */
   IRTemp  e1   = newTemp(Ity_V128);   /* shifted result */
   UChar   amt, size;
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);   /* immediate shift amount */
   delta += 2;
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRexRM(pfx,rm)) );
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   /* 'size' is the lane width in bits for the chosen op. */
   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
     /* Logical shift by >= lane width yields zero; amt is known at
        decode time so the choice is made here, not in IR. */
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     /* Arithmetic right shift saturates at a shift of size-1. */
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   return delta;
}
   8575 
   8576 
   8577 /* Get the current SSE rounding mode. */
   8578 
   8579 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   8580 {
   8581    return
   8582       unop( Iop_64to32,
   8583             binop( Iop_And64,
   8584                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   8585                    mkU64(3) ));
   8586 }
   8587 
   8588 static void put_sse_roundingmode ( IRExpr* sseround )
   8589 {
   8590    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   8591    stmt( IRStmt_Put( OFFB_SSEROUND,
   8592                      unop(Iop_32Uto64,sseround) ) );
   8593 }
   8594 
   8595 /* Break a 128-bit value up into four 32-bit ints. */
   8596 
   8597 static void breakup128to32s ( IRTemp t128,
   8598                               /*OUTs*/
   8599                               IRTemp* t3, IRTemp* t2,
   8600                               IRTemp* t1, IRTemp* t0 )
   8601 {
   8602    IRTemp hi64 = newTemp(Ity_I64);
   8603    IRTemp lo64 = newTemp(Ity_I64);
   8604    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   8605    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   8606 
   8607    vassert(t0 && *t0 == IRTemp_INVALID);
   8608    vassert(t1 && *t1 == IRTemp_INVALID);
   8609    vassert(t2 && *t2 == IRTemp_INVALID);
   8610    vassert(t3 && *t3 == IRTemp_INVALID);
   8611 
   8612    *t0 = newTemp(Ity_I32);
   8613    *t1 = newTemp(Ity_I32);
   8614    *t2 = newTemp(Ity_I32);
   8615    *t3 = newTemp(Ity_I32);
   8616    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   8617    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   8618    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   8619    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   8620 }
   8621 
   8622 /* Construct a 128-bit value from four 32-bit ints. */
   8623 
   8624 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   8625                               IRTemp t1, IRTemp t0 )
   8626 {
   8627    return
   8628       binop( Iop_64HLtoV128,
   8629              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   8630              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   8631    );
   8632 }
   8633 
   8634 /* Break a 64-bit value up into four 16-bit ints. */
   8635 
   8636 static void breakup64to16s ( IRTemp t64,
   8637                              /*OUTs*/
   8638                              IRTemp* t3, IRTemp* t2,
   8639                              IRTemp* t1, IRTemp* t0 )
   8640 {
   8641    IRTemp hi32 = newTemp(Ity_I32);
   8642    IRTemp lo32 = newTemp(Ity_I32);
   8643    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   8644    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   8645 
   8646    vassert(t0 && *t0 == IRTemp_INVALID);
   8647    vassert(t1 && *t1 == IRTemp_INVALID);
   8648    vassert(t2 && *t2 == IRTemp_INVALID);
   8649    vassert(t3 && *t3 == IRTemp_INVALID);
   8650 
   8651    *t0 = newTemp(Ity_I16);
   8652    *t1 = newTemp(Ity_I16);
   8653    *t2 = newTemp(Ity_I16);
   8654    *t3 = newTemp(Ity_I16);
   8655    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   8656    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   8657    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   8658    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   8659 }
   8660 
   8661 /* Construct a 64-bit value from four 16-bit ints. */
   8662 
   8663 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   8664                              IRTemp t1, IRTemp t0 )
   8665 {
   8666    return
   8667       binop( Iop_32HLto64,
   8668              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   8669              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   8670    );
   8671 }
   8672 
   8673 
   8674 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   8675    values (aa,bb), computes, for each of the 4 16-bit lanes:
   8676 
   8677    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   8678 */
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);   /* aa lanes 3,2 sign-extended to 32 bits */
   IRTemp aalo32s = newTemp(Ity_I64);   /* aa lanes 1,0 sign-extended to 32 bits */
   IRTemp bbhi32s = newTemp(Ity_I64);   /* bb lanes 3,2 sign-extended to 32 bits */
   IRTemp bblo32s = newTemp(Ity_I64);   /* bb lanes 1,0 sign-extended to 32 bits */
   IRTemp rHi     = newTemp(Ity_I64);   /* results for lanes 3,2 (as 32-bit lanes) */
   IRTemp rLo     = newTemp(Ity_I64);   /* results for lanes 1,0 (as 32-bit lanes) */
   IRTemp one32x2 = newTemp(Ity_I64);   /* constant 1 in each 32-bit lane */
   assign(aa, aax);
   assign(bb, bbx);
   /* Sign-extend each 16-bit lane to 32 bits: interleaving a value
      with itself duplicates each lane into the top half of a 32-bit
      slot, and the arithmetic shift right by 16 then sign-extends it. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   /* 0x00000001_00000001: the rounding "+1" for both 32-bit lanes. */
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* Per 32-bit lane: ((a*b >>u 14) + 1) >>u 1, i.e. the rounded
      high part required by PMULHRSW. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Each result fits in the low 16 bits of its 32-bit lane; gather
      the even 16-bit lanes of rHi:rLo back into four 16-bit lanes. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   8744 
   8745 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   8746    values (aa,bb), computes, for each lane:
   8747 
   8748           if aa_lane < 0 then - bb_lane
   8749      else if aa_lane > 0 then bb_lane
   8750      else 0
   8751 */
   8752 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   8753 {
   8754    IRTemp aa       = newTemp(Ity_I64);
   8755    IRTemp bb       = newTemp(Ity_I64);
   8756    IRTemp zero     = newTemp(Ity_I64);
   8757    IRTemp bbNeg    = newTemp(Ity_I64);
   8758    IRTemp negMask  = newTemp(Ity_I64);
   8759    IRTemp posMask  = newTemp(Ity_I64);
   8760    IROp   opSub    = Iop_INVALID;
   8761    IROp   opCmpGTS = Iop_INVALID;
   8762 
   8763    switch (laneszB) {
   8764       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   8765       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   8766       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   8767       default: vassert(0);
   8768    }
   8769 
   8770    assign( aa,      aax );
   8771    assign( bb,      bbx );
   8772    assign( zero,    mkU64(0) );
   8773    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   8774    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   8775    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   8776 
   8777    return
   8778       binop(Iop_Or64,
   8779             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   8780             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   8781 
   8782 }
   8783 
   8784 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   8785    value aa, computes, for each lane
   8786 
   8787    if aa < 0 then -aa else aa
   8788 
   8789    Note that the result is interpreted as unsigned, so that the
   8790    absolute value of the most negative signed input can be
   8791    represented.
   8792 */
   8793 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   8794 {
   8795    IRTemp aa      = newTemp(Ity_I64);
   8796    IRTemp zero    = newTemp(Ity_I64);
   8797    IRTemp aaNeg   = newTemp(Ity_I64);
   8798    IRTemp negMask = newTemp(Ity_I64);
   8799    IRTemp posMask = newTemp(Ity_I64);
   8800    IROp   opSub   = Iop_INVALID;
   8801    IROp   opSarN  = Iop_INVALID;
   8802 
   8803    switch (laneszB) {
   8804       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   8805       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   8806       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   8807       default: vassert(0);
   8808    }
   8809 
   8810    assign( aa,      aax );
   8811    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   8812    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   8813    assign( zero,    mkU64(0) );
   8814    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   8815    return
   8816       binop(Iop_Or64,
   8817             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   8818             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   8819 }
   8820 
   8821 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   8822                                         IRTemp lo64, Long byteShift )
   8823 {
   8824    vassert(byteShift >= 1 && byteShift <= 7);
   8825    return
   8826       binop(Iop_Or64,
   8827             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   8828             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   8829       );
   8830 }
   8831 
   8832 /* Generate a SIGSEGV followed by a restart of the current instruction
   8833    if effective_addr is not 16-aligned.  This is required behaviour
   8834    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   8835    This assumes that guest_RIP_curr_instr is set correctly! */
   8836 /* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
   8837  * the check. Need to enable it once TSan stops generating unaligned
   8838  * accesses in the wrappers.
   8839  * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
{
   /* NOTE(review): the alignment mask is 0x0 rather than 0xF, so the
      And64 always produces 0, the CmpNE64 is always False, and the
      SIGSEGV side-exit can never be taken -- the check is compiled
      out on purpose; see the TODO(glider) comment preceding this
      function.  Restore the 0xF mask to re-enable real 16-byte
      alignment checking. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(0x0)),
               mkU64(0)),
         Ijk_SigSEGV,
         IRConst_U64(guest_RIP_curr_instr)
      )
   );
}
   8852 
   8853 
   8854 /* Helper for deciding whether a given insn (starting at the opcode
   8855    byte) may validly be used with a LOCK prefix.  The following insns
   8856    may be used with LOCK when their destination operand is in memory.
   8857    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   8858 
   8859    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   8860    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   8861    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   8863    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   8864    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   8865    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   8866 
   8867    DEC        FE /1,  FF /1
   8868    INC        FE /0,  FF /0
   8869 
   8870    NEG        F6 /3,  F7 /3
   8871    NOT        F6 /2,  F7 /2
   8872 
   8873    XCHG       86, 87
   8874 
   8875    BTC        0F BB,  0F BA /7
   8876    BTR        0F B3,  0F BA /6
   8877    BTS        0F AB,  0F BA /5
   8878 
   8879    CMPXCHG    0F B0,  0F B1
   8880    CMPXCHG8B  0F C7 /1
   8881 
   8882    XADD       0F C0,  0F C1
   8883 
   8884    ------------------------------
   8885 
   8886    80 /0  =  addb $imm8,  rm8
   8887    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   8888    82 /0  =  addb $imm8,  rm8
   8889    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   8890 
   8891    00     =  addb r8,  rm8
   8892    01     =  addl r32, rm32  and  addw r16, rm16
   8893 
   8894    Same for ADD OR ADC SBB AND SUB XOR
   8895 
   8896    FE /1  = dec rm8
   8897    FF /1  = dec rm32  and  dec rm16
   8898 
   8899    FE /0  = inc rm8
   8900    FF /0  = inc rm32  and  inc rm16
   8901 
   8902    F6 /3  = neg rm8
   8903    F7 /3  = neg rm32  and  neg rm16
   8904 
   8905    F6 /2  = not rm8
   8906    F7 /2  = not rm32  and  not rm16
   8907 
   8908    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   8910 
   8911    Same for BTS, BTR
   8912 */
   8913 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   8914 {
   8915    switch (opc[0]) {
   8916       case 0x00: case 0x01: case 0x08: case 0x09:
   8917       case 0x10: case 0x11: case 0x18: case 0x19:
   8918       case 0x20: case 0x21: case 0x28: case 0x29:
   8919       case 0x30: case 0x31:
   8920          if (!epartIsReg(opc[1]))
   8921             return True;
   8922          break;
   8923 
   8924       case 0x80: case 0x81: case 0x82: case 0x83:
   8925          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   8926              && !epartIsReg(opc[1]))
   8927             return True;
   8928          break;
   8929 
   8930       case 0xFE: case 0xFF:
   8931          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   8932              && !epartIsReg(opc[1]))
   8933             return True;
   8934          break;
   8935 
   8936       case 0xF6: case 0xF7:
   8937          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   8938              && !epartIsReg(opc[1]))
   8939             return True;
   8940          break;
   8941 
   8942       case 0x86: case 0x87:
   8943          if (!epartIsReg(opc[1]))
   8944             return True;
   8945          break;
   8946 
   8947       case 0x0F: {
   8948          switch (opc[1]) {
   8949             case 0xBB: case 0xB3: case 0xAB:
   8950                if (!epartIsReg(opc[2]))
   8951                   return True;
   8952                break;
   8953             case 0xBA:
   8954                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   8955                    && !epartIsReg(opc[2]))
   8956                   return True;
   8957                break;
   8958             case 0xB0: case 0xB1:
   8959                if (!epartIsReg(opc[2]))
   8960                   return True;
   8961                break;
   8962             case 0xC7:
   8963                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   8964                   return True;
   8965                break;
   8966             case 0xC0: case 0xC1:
   8967                if (!epartIsReg(opc[2]))
   8968                   return True;
   8969                break;
   8970             default:
   8971                break;
   8972          } /* switch (opc[1]) */
   8973          break;
   8974       }
   8975 
   8976       default:
   8977          break;
   8978    } /* switch (opc[0]) */
   8979 
   8980    return False;
   8981 }
   8982 
   8983 
   8984 /*------------------------------------------------------------*/
   8985 /*--- Disassemble a single instruction                     ---*/
   8986 /*------------------------------------------------------------*/
   8987 
   8988 /* Disassemble a single instruction into IR.  The instruction is
   8989    located in host memory at &guest_code[delta]. */
   8990 
   8991 static
   8992 DisResult disInstr_AMD64_WRK (
   8993              /*OUT*/Bool* expect_CAS,
   8994              Bool         put_IP,
   8995              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   8996              Bool         resteerCisOk,
   8997              void*        callback_opaque,
   8998              Long         delta64,
   8999              VexArchInfo* archinfo,
   9000              VexAbiInfo*  vbi
   9001           )
   9002 {
   9003    IRType    ty;
   9004    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   9005    Int       alen;
   9006    UChar     opc, modrm, abyte, pre;
   9007    Long      d64;
   9008    HChar     dis_buf[50];
   9009    Int       am_sz, d_sz, n, n_prefixes;
   9010    DisResult dres;
   9011    UChar*    insn; /* used in SSE decoders */
   9012 
   9013    /* The running delta */
   9014    Long delta = delta64;
   9015 
   9016    /* Holds eip at the start of the insn, so that we can print
   9017       consistent error messages for unimplemented insns. */
   9018    Long delta_start = delta;
   9019 
   9020    /* sz denotes the nominal data-op size of the insn; we change it to
   9021       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   9022       conflict REX.W takes precedence. */
   9023    Int sz = 4;
   9024 
   9025    /* pfx holds the summary of prefixes. */
   9026    Prefix pfx = PFX_EMPTY;
   9027 
   9028    /* Set result defaults. */
   9029    dres.whatNext   = Dis_Continue;
   9030    dres.len        = 0;
   9031    dres.continueAt = 0;
   9032 
   9033    *expect_CAS = False;
   9034 
   9035    vassert(guest_RIP_next_assumed == 0);
   9036    vassert(guest_RIP_next_mustcheck == False);
   9037 
   9038    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   9039 
   9040    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   9041 
   9042    /* We may be asked to update the guest RIP before going further. */
   9043    if (put_IP)
   9044       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr)) );
   9045 
   9046    /* Spot "Special" instructions (see comment at top of file). */
   9047    {
   9048       UChar* code = (UChar*)(guest_code + delta);
   9049       /* Spot the 16-byte preamble:
   9050          48C1C703   rolq $3,  %rdi
   9051          48C1C70D   rolq $13, %rdi
   9052          48C1C73D   rolq $61, %rdi
   9053          48C1C733   rolq $51, %rdi
   9054       */
   9055       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   9056                                                && code[ 3] == 0x03 &&
   9057           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   9058                                                && code[ 7] == 0x0D &&
   9059           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   9060                                                && code[11] == 0x3D &&
   9061           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   9062                                                && code[15] == 0x33) {
   9063          /* Got a "Special" instruction preamble.  Which one is it? */
   9064          if (code[16] == 0x48 && code[17] == 0x87
   9065                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   9066             /* %RDX = client_request ( %RAX ) */
   9067             DIP("%%rdx = client_request ( %%rax )\n");
   9068             delta += 19;
   9069             jmp_lit(Ijk_ClientReq, guest_RIP_bbstart+delta);
   9070             dres.whatNext = Dis_StopHere;
   9071             goto decode_success;
   9072          }
   9073          else
   9074          if (code[16] == 0x48 && code[17] == 0x87
   9075                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   9076             /* %RAX = guest_NRADDR */
   9077             DIP("%%rax = guest_NRADDR\n");
   9078             delta += 19;
   9079             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   9080             goto decode_success;
   9081          }
   9082          else
   9083          if (code[16] == 0x48 && code[17] == 0x87
   9084                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   9085             /* call-noredir *%RAX */
   9086             DIP("call-noredir *%%rax\n");
   9087             delta += 19;
   9088             t1 = newTemp(Ity_I64);
   9089             assign(t1, getIRegRAX(8));
   9090             t2 = newTemp(Ity_I64);
   9091             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   9092             putIReg64(R_RSP, mkexpr(t2));
   9093             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   9094             jmp_treg(Ijk_NoRedir,t1);
   9095             dres.whatNext = Dis_StopHere;
   9096             goto decode_success;
   9097          }
   9098          /* We don't know what it is. */
   9099          goto decode_failure;
   9100          /*NOTREACHED*/
   9101       }
   9102    }
   9103 
   9104    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   9105       as many invalid combinations as possible. */
   9106    n_prefixes = 0;
   9107    while (True) {
   9108       if (n_prefixes > 7) goto decode_failure;
   9109       pre = getUChar(delta);
   9110       switch (pre) {
   9111          case 0x66: pfx |= PFX_66; break;
   9112          case 0x67: pfx |= PFX_ASO; break;
   9113          case 0xF2: pfx |= PFX_F2; break;
   9114          case 0xF3: pfx |= PFX_F3; break;
   9115          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   9116          case 0x2E: pfx |= PFX_CS; break;
   9117          case 0x3E: pfx |= PFX_DS; break;
   9118          case 0x26: pfx |= PFX_ES; break;
   9119          case 0x64: pfx |= PFX_FS; break;
   9120          case 0x65: pfx |= PFX_GS; break;
   9121          case 0x36: pfx |= PFX_SS; break;
   9122          case 0x40 ... 0x4F:
   9123             pfx |= PFX_REX;
   9124             if (pre & (1<<3)) pfx |= PFX_REXW;
   9125             if (pre & (1<<2)) pfx |= PFX_REXR;
   9126             if (pre & (1<<1)) pfx |= PFX_REXX;
   9127             if (pre & (1<<0)) pfx |= PFX_REXB;
   9128             break;
   9129          default:
   9130             goto not_a_prefix;
   9131       }
   9132       n_prefixes++;
   9133       delta++;
   9134    }
   9135 
   9136    not_a_prefix:
   9137 
   9138    /* Dump invalid combinations */
   9139    n = 0;
   9140    if (pfx & PFX_F2) n++;
   9141    if (pfx & PFX_F3) n++;
   9142    if (n > 1)
   9143       goto decode_failure; /* can't have both */
   9144 
   9145    n = 0;
   9146    if (pfx & PFX_CS) n++;
   9147    if (pfx & PFX_DS) n++;
   9148    if (pfx & PFX_ES) n++;
   9149    if (pfx & PFX_FS) n++;
   9150    if (pfx & PFX_GS) n++;
   9151    if (pfx & PFX_SS) n++;
   9152    if (n > 1)
   9153       goto decode_failure; /* multiple seg overrides == illegal */
   9154 
   9155    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   9156       that we should accept it. */
   9157    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
   9158       goto decode_failure;
   9159 
   9160    /* Ditto for %gs prefixes. */
   9161    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
   9162       goto decode_failure;
   9163 
   9164    /* Set up sz. */
   9165    sz = 4;
   9166    if (pfx & PFX_66) sz = 2;
   9167    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   9168 
   9169    /* Now we should be looking at the primary opcode byte or the
   9170       leading F2 or F3.  Check that any LOCK prefix is actually
   9171       allowed. */
   9172 
   9173    if (pfx & PFX_LOCK) {
   9174       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   9175          DIP("lock ");
   9176       } else {
   9177          *expect_CAS = False;
   9178          goto decode_failure;
   9179       }
   9180    }
   9181 
   9182 
   9183    /* ---------------------------------------------------- */
   9184    /* --- The SSE/SSE2 decoder.                        --- */
   9185    /* ---------------------------------------------------- */
   9186 
   9187    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   9188       previous life? */
   9189 
   9190    /* Note, this doesn't handle SSE3 right now.  All amd64s support
   9191       SSE2 as a minimum so there is no point distinguishing SSE1 vs
   9192       SSE2. */
   9193 
   9194    insn = (UChar*)&guest_code[delta];
   9195 
   9196    /* FXSAVE is spuriously at the start here only because it is
   9197       thusly placed in guest-x86/toIR.c. */
   9198 
   9199    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
   9200       Note that the presence or absence of REX.W slightly affects the
   9201       written format: whether the saved FPU IP and DP pointers are 64
   9202       or 32 bits.  But the helper function we call simply writes zero
   9203       bits in the relevant fields (which are 64 bits regardless of
   9204       what REX.W is) and so it's good enough (iow, equally broken) in
   9205       both cases. */
   9206    if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   9207        && insn[0] == 0x0F && insn[1] == 0xAE
   9208        && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 0) {
   9209        IRDirty* d;
   9210       modrm = getUChar(delta+2);
   9211       vassert(!epartIsReg(modrm));
   9212 
   9213       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   9214       delta += 2+alen;
   9215       gen_SEGV_if_not_16_aligned(addr);
   9216 
   9217       DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   9218 
   9219       /* Uses dirty helper:
   9220             void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
   9221       d = unsafeIRDirty_0_N (
   9222              0/*regparms*/,
   9223              "amd64g_dirtyhelper_FXSAVE",
   9224              &amd64g_dirtyhelper_FXSAVE,
   9225              mkIRExprVec_1( mkexpr(addr) )
   9226           );
   9227       d->needsBBP = True;
   9228 
   9229       /* declare we're writing memory */
   9230       d->mFx   = Ifx_Write;
   9231       d->mAddr = mkexpr(addr);
   9232       d->mSize = 512;
   9233 
   9234       /* declare we're reading guest state */
   9235       d->nFxState = 7;
   9236 
   9237       d->fxState[0].fx     = Ifx_Read;
   9238       d->fxState[0].offset = OFFB_FTOP;
   9239       d->fxState[0].size   = sizeof(UInt);
   9240 
   9241       d->fxState[1].fx     = Ifx_Read;
   9242       d->fxState[1].offset = OFFB_FPREGS;
   9243       d->fxState[1].size   = 8 * sizeof(ULong);
   9244 
   9245       d->fxState[2].fx     = Ifx_Read;
   9246       d->fxState[2].offset = OFFB_FPTAGS;
   9247       d->fxState[2].size   = 8 * sizeof(UChar);
   9248 
   9249       d->fxState[3].fx     = Ifx_Read;
   9250       d->fxState[3].offset = OFFB_FPROUND;
   9251       d->fxState[3].size   = sizeof(ULong);
   9252 
   9253       d->fxState[4].fx     = Ifx_Read;
   9254       d->fxState[4].offset = OFFB_FC3210;
   9255       d->fxState[4].size   = sizeof(ULong);
   9256 
   9257       d->fxState[5].fx     = Ifx_Read;
   9258       d->fxState[5].offset = OFFB_XMM0;
   9259       d->fxState[5].size   = 16 * sizeof(U128);
   9260 
   9261       d->fxState[6].fx     = Ifx_Read;
   9262       d->fxState[6].offset = OFFB_SSEROUND;
   9263       d->fxState[6].size   = sizeof(ULong);
   9264 
   9265       /* Be paranoid ... this assertion tries to ensure the 16 %xmm
   9266 	 images are packed back-to-back.  If not, the value of
   9267 	 d->fxState[5].size is wrong. */
   9268       vassert(16 == sizeof(U128));
   9269       vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
   9270 
   9271       stmt( IRStmt_Dirty(d) );
   9272 
   9273       goto decode_success;
   9274    }
   9275 
   9276    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
   9277       As with FXSAVE above we ignore the value of REX.W since we're
   9278       not bothering with the FPU DP and IP fields. */
   9279    if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   9280        && insn[0] == 0x0F && insn[1] == 0xAE
   9281        && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 1) {
   9282        IRDirty* d;
   9283       modrm = getUChar(delta+2);
   9284       vassert(!epartIsReg(modrm));
   9285 
   9286       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   9287       delta += 2+alen;
   9288       gen_SEGV_if_not_16_aligned(addr);
   9289 
   9290       DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   9291 
   9292       /* Uses dirty helper:
   9293             VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
   9294          NOTE:
   9295             the VexEmWarn value is simply ignored
   9296       */
   9297       d = unsafeIRDirty_0_N (
   9298              0/*regparms*/,
   9299              "amd64g_dirtyhelper_FXRSTOR",
   9300              &amd64g_dirtyhelper_FXRSTOR,
   9301              mkIRExprVec_1( mkexpr(addr) )
   9302           );
   9303       d->needsBBP = True;
   9304 
   9305       /* declare we're reading memory */
   9306       d->mFx   = Ifx_Read;
   9307       d->mAddr = mkexpr(addr);
   9308       d->mSize = 512;
   9309 
   9310       /* declare we're writing guest state */
   9311       d->nFxState = 7;
   9312 
   9313       d->fxState[0].fx     = Ifx_Write;
   9314       d->fxState[0].offset = OFFB_FTOP;
   9315       d->fxState[0].size   = sizeof(UInt);
   9316 
   9317       d->fxState[1].fx     = Ifx_Write;
   9318       d->fxState[1].offset = OFFB_FPREGS;
   9319       d->fxState[1].size   = 8 * sizeof(ULong);
   9320 
   9321       d->fxState[2].fx     = Ifx_Write;
   9322       d->fxState[2].offset = OFFB_FPTAGS;
   9323       d->fxState[2].size   = 8 * sizeof(UChar);
   9324 
   9325       d->fxState[3].fx     = Ifx_Write;
   9326       d->fxState[3].offset = OFFB_FPROUND;
   9327       d->fxState[3].size   = sizeof(ULong);
   9328 
   9329       d->fxState[4].fx     = Ifx_Write;
   9330       d->fxState[4].offset = OFFB_FC3210;
   9331       d->fxState[4].size   = sizeof(ULong);
   9332 
   9333       d->fxState[5].fx     = Ifx_Write;
   9334       d->fxState[5].offset = OFFB_XMM0;
   9335       d->fxState[5].size   = 16 * sizeof(U128);
   9336 
   9337       d->fxState[6].fx     = Ifx_Write;
   9338       d->fxState[6].offset = OFFB_SSEROUND;
   9339       d->fxState[6].size   = sizeof(ULong);
   9340 
   9341       /* Be paranoid ... this assertion tries to ensure the 16 %xmm
   9342 	 images are packed back-to-back.  If not, the value of
   9343 	 d->fxState[5].size is wrong. */
   9344       vassert(16 == sizeof(U128));
   9345       vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
   9346 
   9347       stmt( IRStmt_Dirty(d) );
   9348 
   9349       goto decode_success;
   9350    }
   9351 
   /* ------ SSE decoder main ------ */

   /* Each case below matches on the prefix set (via the have..()
      predicates), the operand size 'sz' and the opcode bytes
      insn[0..1], then delegates to a dis_SSE_* helper which decodes
      the modrm/operand bytes starting at delta+2 and returns the
      updated delta past the instruction. */

   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4 );
      goto decode_success;
   }

   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4 );
      goto decode_success;
   }

   /* 0F 55 = ANDNPS -- G = (not G) and E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x55) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128 );
      goto decode_success;
   }

   /* 0F 54 = ANDPS -- G = G and E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x54) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128 );
      goto decode_success;
   }

   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R.  The True/4
      args select the all-lanes variant at 4-byte granularity. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpps", True, 4 );
      goto decode_success;
   }

   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R (lowest
      lane only, hence False). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpss", False, 4 );
      goto decode_success;
   }
   9395 
   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   /* Both compare the lowest F32 lane of G against E (reg or mem)
      and deposit the result in the Z/C/P flag positions.  The
      ordered/unordered distinction only affects QNaN signalling,
      which is not modelled here, so the two opcodes share one
      translation. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F32);
      IRTemp argR = newTemp(Ity_F32);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                         0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
         delta += 2+alen;
         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }
      assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );

      /* Set the flag thunk to OP_COPY, with the comparison result
         (widened to F64 and compared with CmpF64) in DEP1.  Masking
         with 0x45 keeps only the bit positions corresponding to the
         Z, C and P flags; presumably the Iop_CmpF64 result encoding
         lines them up exactly -- NOTE(review): confirm against the
         IRCmpF64Result definition. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And64,
                      unop( Iop_32Uto64,
                            binop(Iop_CmpF64,
                                  unop(Iop_F32toF64,mkexpr(argL)),
                                  unop(Iop_F32toF64,mkexpr(argR)))),
                      mkU64(0x45)
          )));

      goto decode_success;
   }
   9435 
   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
      half xmm.  The upper half of the destination xmm is left
      unchanged.  Conversion uses the prevailing SSE rounding mode. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      /* MMX source operand: put the FPU in MMX mode first. */
      do_MMX_preamble();
      if (epartIsReg(modrm)) {
         assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );

      /* Low I32 -> F64 (exact) -> F32 (rounded) into lane 0 ... */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 0,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64to32, mkexpr(arg64)) )) );

      /* ... and high I32 likewise into lane 1. */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 1,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64HIto32, mkexpr(arg64)) )) );

      goto decode_success;
   }
   9476 
   /* F3 0F 2A = CVTSI2SS
      -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
      -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm
      Only lane 0 of the destination is written; lanes 1..3 are left
      unchanged.  Rounding follows the prevailing SSE rounding mode. */
   if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x2A) {

      IRTemp rmode = newTemp(Ity_I32);
      assign( rmode, get_sse_roundingmode() );
      modrm = getUChar(delta+2);

      if (sz == 4) {
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I32 -> F64 is exact, so rounding happens only at the
            final F64 -> F32 narrowing. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
      } else {
         /* sz == 8 */
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I64 -> F64 can itself lose precision, so the rounding
            mode is supplied to both conversion steps. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
      }

      goto decode_success;
   }
   9530 
   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, according to prevailing SSE rounding mode */
   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, rounding towards zero */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp dst64  = newTemp(Ity_I64);
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      IRTemp f32hi  = newTemp(Ity_F32);
      /* 0x2C is the truncating (round-to-zero) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);

      /* MMX destination: put the FPU in MMX mode first. */
      do_MMX_preamble();
      modrm = getUChar(delta+2);

      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         /* Memory form: two consecutive F32s at addr and addr+4. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
                                              mkexpr(addr),
                                              mkU64(4) )));
         delta += 2+alen;
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Build the 64-bit MMX result as hi32:lo32. */
      assign(
         dst64,
         binop( Iop_32HLto64,
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
              )
      );

      putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
      goto decode_success;
   }
   9586 
   /* F3 0F 2D = CVTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    according to prevailing SSE rounding mode
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    according to prevailing SSE rounding mode
   */
   /* F3 0F 2C = CVTTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    truncating towards zero
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    truncating towards zero
   */
   if (haveF3no66noF2(pfx)
       && insn[0] == 0x0F
       && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      /* 0x2C is the truncating (round-to-zero) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);
      vassert(sz == 4 || sz == 8);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      }

      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Widen F32 -> F64 (exact) and then do the rounded integer
         conversion at the destination width. */
      if (sz == 4) {
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI32S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      } else {
         putIReg64( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI64S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      }

      goto decode_success;
   }
   9643 
   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R (lowest lane only) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4 );
      goto decode_success;
   }
   9657 
   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr.  Memory-only form;
      /2 means the reg field of the modrm byte must be 2. */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 2) {

      IRTemp t64 = newTemp(Ity_I64);
      IRTemp ew = newTemp(Ity_I32);

      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;
      DIP("ldmxcsr %s\n", dis_buf);

      /* The only thing we observe in %mxcsr is the rounding mode.
         Therefore, pass the 32-bit value (SSE native-format control
         word) to a clean helper, getting back a 64-bit value, the
         lower half of which is the SSEROUND value to store, and the
         upper half of which is the emulation-warning token which may
         be generated.
      */
      /* ULong amd64h_check_ldmxcsr ( ULong ); */
      assign( t64, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_check_ldmxcsr",
                      &amd64g_check_ldmxcsr,
                      mkIRExprVec_1(
                         unop(Iop_32Uto64,
                              loadLE(Ity_I32, mkexpr(addr))
                         )
                      )
                   )
            );

      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
      put_emwarn( mkexpr(ew) );
      /* Finally, if an emulation warning was reported, side-exit to
         the next insn, reporting the warning, so that Valgrind's
         dispatcher sees the warning. */
      stmt(
         IRStmt_Exit(
            binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
            Ijk_EmWarn,
            IRConst_U64(guest_RIP_bbstart+delta)
         )
      );
      goto decode_success;
   }
   9706 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      Bool ok = False;
      /* NOTE(review): delta+1, not delta+2 as elsewhere in this
         section -- presumably dis_MMX expects to see the second
         opcode byte itself; confirm against dis_MMX. */
      delta = dis_MMX( &ok, vbi, pfx, sz, delta+1 );
      if (!ok)
         goto decode_failure;
      goto decode_success;
   }

   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R (lowest lane only) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4 );
      goto decode_success;
   }

   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R (lowest lane only) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4 );
      goto decode_success;
   }
   9745 
   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   /* Identical except that the aligned form (MOVAPS) faults on a
      misaligned memory operand. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned variant checks 16-byte alignment. */
         if (insn[1] == 0x28/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov[ua]ps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }

   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x29 || insn[1] == 0x11)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; awaiting test case */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         if (insn[1] == 0x29/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
   9790 
   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   /* Same opcode: the register form is MOVLHPS, the memory form is
      MOVHPS.  Either way, only the upper 64-bit lane of G is
      written. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
         DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhps %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRexRM(pfx,modrm) ));
      }
      goto decode_success;
   }

   /* 0F 17 = MOVHPS -- move from high half of XMM to mem.
      Memory-only form; the register form falls through to later
      decoders. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x17) {
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }

   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   /* Same opcode: the register form is MOVHLPS, the memory form is
      MOVLPS.  Either way, only the lower 64-bit lane of G is
      written. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),
                          0/*lower lane*/,
                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlps %s, %s\n",
             dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
      }
      goto decode_success;
   }

   /* 0F 13 = MOVLPS -- move from low half of XMM to mem.
      Memory-only form, as for 0F 17 above. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x13) {
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   0/*lower lane*/ ) );
         DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
   9873 
   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
      to 4 lowest bits of ireg(G) */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:

         4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

         20071106: Intel docs say that REX.W isn't redundant: when
         present, a 64-bit register is written; when not present, only
         the 32-bit half is written.  However, testing on a Core2
         machine suggests the entire 64 bit register is written
         irrespective of the status of REX.W.  That could be because
         of the default rule that says "if the lower half of a 32-bit
         register is written, the upper half is zeroed".  By using
         putIReg32 here we inadvertantly produce the same behaviour as
         the Core2, for the same reason -- putIReg32 implements said
         rule.

         AMD docs give no indication that REX.W is even valid for this
         insn. */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         t2 = newTemp(Ity_I32);
         t3 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* For lane k, shift the sign bit (bit 31) down to bit k and
            mask it out, then OR the four results together. */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
                            mkU32(1) ));
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
                            mkU32(2) ));
         assign( t2, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
                            mkU32(4) ));
         assign( t3, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
                            mkU32(8) ));
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32,
                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
                         )
                 );
         DIP("movmskps %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
   }
   9929 
   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   /* The non-temporal (cache-bypass) hint is not modelled; the
      16-byte alignment requirement is. */
   if ( ( (haveNo66noF2noF3(pfx) && sz == 4)
          || (have66noF2noF3(pfx) && sz == 2)
        )
        && insn[0] == 0x0F && insn[1] == 0x2B) {
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
      Intel manual does not say anything about the usual business of
      the FP reg tags getting trashed whenever an MMX insn happens.
      So we just leave them alone.
   */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE7) {
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         /* do_MMX_preamble(); Intel docs don't specify this */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
         DIP("movntq %s,%s\n", dis_buf,
                               nameMMXReg(gregLO3ofRM(modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }
   9970 
   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out.  If E
      is a register, the upper 3/4 of G is left unchanged. */
   if (haveF3no66noF2(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x10) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
         DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Zero the whole destination first, then write lane 0. */
         putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          loadLE(Ity_I32, mkexpr(addr)) );
         DIP("movss %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }

   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
      or lo 1/4 xmm).  Only the memory form is handled here; the
      register form falls through. */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, we don't yet have a test case */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
         DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
         goto decode_success;
      }
   }
   10012 
   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4 );
      goto decode_success;
   }

   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R
      (scalar: only the lowest 32-bit lane is multiplied) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4 );
      goto decode_success;
   }

   /* 0F 56 = ORPS -- G = G or E (bitwise OR of all 128 bits) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128 );
      goto decode_success;
   }
   10033 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE0) {
      /* Uses MMX registers, so establish MMX state first. */
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgb", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE3) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgw", False );
      goto decode_success;
   }
   10053 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
      zero-extend of it in ireg(G). */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         IRTemp sV = newTemp(Ity_I64);
         t5 = newTemp(Ity_I16);
         do_MMX_preamble();
         assign(sV, getMMXReg(eregLO3ofRM(modrm)));
         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
         /* Lane select is the low 2 bits of the imm8 after the modrm
            byte; the upper 6 bits of the immediate are ignored. */
         switch (insn[3] & 3) {
            case 0:  assign(t5, mkexpr(t0)); break;
            case 1:  assign(t5, mkexpr(t1)); break;
            case 2:  assign(t5, mkexpr(t2)); break;
            case 3:  assign(t5, mkexpr(t3)); break;
            default: vassert(0);
         }
         /* Zero-extend the 16-bit lane to the destination width,
            which follows REX.W (sz==8 -> 64-bit dest). */
         if (sz == 8)
            putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
         else
            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameMMXReg(eregLO3ofRM(modrm)),
                           sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                                 : nameIReg32(gregOfRexRM(pfx,modrm))
         );
         delta += 4;   /* 2 opcode bytes + modrm + imm8 */
         goto decode_success;
      }
      /* else fall through */
      /* note, for anyone filling in the mem case: this insn has one
         byte after the amode and therefore you must pass 1 as the
         last arg to disAMode */
   }
   10090 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of mmx(G). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC4) {
      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
         mmx reg.  t4 is the new lane value.  t5 is the original
         mmx value. t6 is the new mmx value. */
      Int lane;
      t4 = newTemp(Ity_I16);
      t5 = newTemp(Ity_I64);
      t6 = newTemp(Ity_I64);
      modrm = insn[2];
      do_MMX_preamble();

      assign(t5, getMMXReg(gregLO3ofRM(modrm)));
      breakup64to16s( t5, &t3, &t2, &t1, &t0 );

      if (epartIsReg(modrm)) {
         /* Source is the low 16 bits of an integer register.  The
            lane-select imm8 immediately follows the modrm byte. */
         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
         delta += 3+1;
         lane = insn[3+1-1];
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg16(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         /* Source is a 16-bit load; the imm8 follows the amode, hence
            the trailing '1' (extra byte) passed to disAMode. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         delta += 3+alen;
         lane = insn[3+alen-1];
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Rebuild the 64-bit value with t4 replacing the chosen lane;
         only the low 2 bits of the immediate select the lane. */
      switch (lane & 3) {
         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
         default: vassert(0);
      }
      putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
      goto decode_success;
   }
   10137 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EE = PMAXSW -- 16x4 signed max */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DE = PMAXUB -- 8x8 unsigned max */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxub", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EA = PMINSW -- 16x4 signed min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DA = PMINUB -- 8x8 unsigned min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminub", False );
      goto decode_success;
   }
   10177 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
      mmx(G), turn them into a byte, and put zero-extend of it in
      ireg(G). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         do_MMX_preamble();
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         assign(t0, getMMXReg(eregLO3ofRM(modrm)));
         /* Mask extraction is done by a clean helper call rather
            than inline IR. */
         assign(t1, mkIRExprCCall(
                       Ity_I64, 0/*regparms*/,
                       "amd64g_calculate_mmx_pmovmskb",
                       &amd64g_calculate_mmx_pmovmskb,
                       mkIRExprVec_1(mkexpr(t0))));
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
         DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through -- memory source is architecturally
         invalid for this insn, so leave it undecoded. */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   /* NOTE(review): the official Intel mnemonic for 0F E4 is PMULHUW;
      "pmuluh" is just the name this decoder prints. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE4) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmuluh", False );
      goto decode_success;
   }
   10213 
   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   /* 0F 18 /2 = PREFETCH1 */
   /* 0F 18 /3 = PREFETCH2 */
   if (insn[0] == 0x0F && insn[1] == 0x18
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) >= 0 && gregLO3ofRM(insn[2]) <= 3) {
      HChar* hintstr = "??";

      modrm = getUChar(delta+2);
      vassert(!epartIsReg(modrm));

      /* The amode is decoded only so the address computation appears
         in the IR and the insn can be printed; no memory access is
         generated -- the prefetch hint itself is a no-op here. */
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* /r field of modrm selects which hint to name. */
      switch (gregLO3ofRM(modrm)) {
         case 0: hintstr = "nta"; break;
         case 1: hintstr = "t0"; break;
         case 2: hintstr = "t1"; break;
         case 3: hintstr = "t2"; break;
         default: vassert(0);
      }

      DIP("prefetch%s %s\n", hintstr, dis_buf);
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF6) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "psadbw", False );
      goto decode_success;
   }
   10251 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_I64);
      dV = newTemp(Ity_I64);
      do_MMX_preamble();
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         order = (Int)insn[3];
         delta += 2+2;   /* 2 opcode bytes + modrm + imm8 */
         DIP("pshufw $%d,%s,%s\n", order,
                                   nameMMXReg(eregLO3ofRM(modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*extra byte after amode*/ );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         order = (Int)insn[2+alen];   /* imm8 follows the amode */
         delta += 3+alen;
         DIP("pshufw $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }
      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
      /* Each 2-bit field of 'order' picks one source lane for the
         corresponding destination lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
   10291 
   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rcpps", Iop_Recip32Fx4 );
      goto decode_success;
   }

   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R
      (scalar: low lane only) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rcpss", Iop_Recip32F0x4 );
      goto decode_success;
   }

   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rsqrtps", Iop_RSqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rsqrtss", Iop_RSqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   if (haveNo66noF2noF3(pfx)
       && insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7
       && sz == 4) {
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("sfence\n");
      goto decode_success;
   }
   10336 
   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];
         delta += 2+2;   /* 2 opcode bytes + modrm + imm8 */
         DIP("shufps $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte at end of insn*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];   /* imm8 follows the amode */
         delta += 3+alen;
         DIP("shufps $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Low two result lanes are selected from dst, high two from
         src, each by a 2-bit field of the immediate. */
#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
                       SELD((select>>2)&3), SELD((select>>0)&3) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
   10384 
   /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "sqrtps", Iop_Sqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R
      (scalar: low lane only) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "sqrtss", Iop_Sqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 3) {

      /* NOTE(review): asserts rather than checks sz; a redundant
         REX.W prefix (sz == 8) would trip this -- confirm intended. */
      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* Fake up a native SSE mxcsr word.  The only thing it depends
         on is SSEROUND[1:0], so call a clean helper to cook it up.
      */
      /* ULong amd64h_create_mxcsr ( ULong sseround ) */
      DIP("stmxcsr %s\n", dis_buf);
      storeLE(
         mkexpr(addr),
         unop(Iop_64to32,
              mkIRExprCCall(
                 Ity_I64, 0/*regp*/,
                 "amd64g_create_mxcsr", &amd64g_create_mxcsr,
                 mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
	      )
	 )
      );
      goto decode_success;
   }

   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4 );
      goto decode_success;
   }
   10441 
   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   /* These just appear to be special cases of SHUFPS */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      Bool hi = toBool(insn[1] == 0x15);   /* 0x15 -> high halves */
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Interleave src and dst lanes: high pair (s3,d3,s2,d2) or low
         pair (s1,d1,s0,d0). */
      if (hi) {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s3, d3, s2, d2 ) );
      } else {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s1, d1, s0, d0 ) );
      }

      goto decode_success;
   }

   /* 0F 57 = XORPS -- G = G xor E (bitwise XOR of all 128 bits) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128 );
      goto decode_success;
   }
   10489 
   10490    /* ---------------------------------------------------- */
   10491    /* --- end of the SSE decoder.                      --- */
   10492    /* ---------------------------------------------------- */
   10493 
   10494    /* ---------------------------------------------------- */
   10495    /* --- start of the SSE2 decoder.                   --- */
   10496    /* ---------------------------------------------------- */
   10497 
   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2 );
      goto decode_success;
   }

   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R
      (scalar: low 64-bit lane only) */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2 );
      goto decode_success;
   }

   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x55) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F 54 = ANDPD -- G = G and E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x54) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmppd", True, 8 );
      goto decode_success;
   }

   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpsd", False, 8 );
      goto decode_success;
   }
   10541 
   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F64);
      IRTemp argR = newTemp(Ity_F64);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                         0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }
      assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );

      /* Set flags via the COPY thunk: the CmpF64 result is masked
         with 0x45, i.e. bits 0, 2 and 6 -- the C, P and Z flag
         positions. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And64,
                      unop( Iop_32Uto64,
                            binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                      mkU64(0x45)
          )));

      goto decode_success;
   }
   10579 
   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
      F64 in xmm(G) */
   if (haveF3no66noF2(pfx) && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp arg64 = newTemp(Ity_I64);
      if (sz != 4) goto decode_failure;

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
         delta += 2+1;
         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32->F64 widening is exact, so no rounding mode is needed.
         Low 32 bits -> lane 0, high 32 bits -> lane 1. */
      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
      );

      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
      );

      goto decode_success;
   }
   10612 
   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
      xmm(G) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32->F32 can round, so use the guest's current SSE rounding
         mode.  Conversion goes via F64 (exact) then F64->F32. */
      assign( rmode, get_sse_roundingmode() );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             unop(Iop_I32StoF64,mkexpr(_t)))

      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10650 
   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), and zero upper half, rounding towards zero */
   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), according to prevailing rounding mode, and zero
      upper half */
   if ( ( (haveF2no66noF3(pfx) && sz == 4)
          || (have66noF2noF3(pfx) && sz == 2)
        )
        && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp argV   = newTemp(Ity_V128);
      IRTemp rmode  = newTemp(Ity_I32);
      /* 66 prefix (sz==2) is the truncating (round-to-zero) form. */
      Bool   r2zero = toBool(sz == 2);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Reinterpret the two 64-bit halves of the source as F64s. */
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toI32S,                   \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Results go to the low half of xmm(G); upper half is zeroed. */
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10706 
   10707    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   10708       I32 in mmx, according to prevailing SSE rounding mode */
   10709    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   10710       I32 in mmx, rounding towards zero */
   10711    if (have66noF2noF3(pfx) && sz == 2
   10712        && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   10713       IRTemp dst64  = newTemp(Ity_I64);
   10714       IRTemp rmode  = newTemp(Ity_I32);
   10715       IRTemp f64lo  = newTemp(Ity_F64);
   10716       IRTemp f64hi  = newTemp(Ity_F64);
   10717       Bool   r2zero = toBool(insn[1] == 0x2C);
   10718 
   10719       do_MMX_preamble();
   10720       modrm = getUChar(delta+2);
   10721 
   10722       if (epartIsReg(modrm)) {
   10723          delta += 2+1;
   10724          assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10725          assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   10726          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   10727                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
   10728                                    nameMMXReg(gregLO3ofRM(modrm)));
   10729       } else {
   10730          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10731          assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10732          assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   10733                                               mkexpr(addr),
   10734                                               mkU64(8) )));
   10735          delta += 2+alen;
   10736          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   10737                                    dis_buf,
   10738                                    nameMMXReg(gregLO3ofRM(modrm)));
   10739       }
   10740 
   10741       if (r2zero) {
   10742          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10743       } else {
   10744          assign( rmode, get_sse_roundingmode() );
   10745       }
   10746 
   10747       assign(
   10748          dst64,
   10749          binop( Iop_32HLto64,
   10750                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   10751                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   10752               )
   10753       );
   10754 
   10755       putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   10756       goto decode_success;
   10757    }
   10758 
   10759    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   10760       lo half xmm(G), rounding according to prevailing SSE rounding
   10761       mode, and zero upper half */
   10762    /* Note, this is practically identical to CVTPD2DQ.  It would have
   10763       been nicer to merge them together, but the insn[] offsets differ
   10764       by one. */
   10765    if (have66noF2noF3(pfx) && sz == 2
   10766        && insn[0] == 0x0F && insn[1] == 0x5A) {
   10767       IRTemp argV  = newTemp(Ity_V128);
   10768       IRTemp rmode = newTemp(Ity_I32);
   10769 
   10770       modrm = getUChar(delta+2);
   10771       if (epartIsReg(modrm)) {
   10772          assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   10773          delta += 2+1;
   10774          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   10775                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10776       } else {
   10777          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10778          assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10779          delta += 2+alen;
   10780          DIP("cvtpd2ps %s,%s\n", dis_buf,
   10781                                  nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10782       }
   10783 
   10784       assign( rmode, get_sse_roundingmode() );
   10785       t0 = newTemp(Ity_F64);
   10786       t1 = newTemp(Ity_F64);
   10787       assign( t0, unop(Iop_ReinterpI64asF64,
   10788                        unop(Iop_V128to64, mkexpr(argV))) );
   10789       assign( t1, unop(Iop_ReinterpI64asF64,
   10790                        unop(Iop_V128HIto64, mkexpr(argV))) );
   10791 
   10792 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   10793                              mkexpr(rmode),                   \
   10794                              mkexpr(_t) )
   10795 
   10796       putXMMRegLane32(  gregOfRexRM(pfx,modrm), 3, mkU32(0) );
   10797       putXMMRegLane32(  gregOfRexRM(pfx,modrm), 2, mkU32(0) );
   10798       putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
   10799       putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
   10800 
   10801 #     undef CVT
   10802 
   10803       goto decode_success;
   10804    }
   10805 
   10806    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   10807       xmm(G) */
   10808    if (have66noF2noF3(pfx) && sz == 2
   10809        && insn[0] == 0x0F && insn[1] == 0x2A) {
   10810       IRTemp arg64 = newTemp(Ity_I64);
   10811 
   10812       modrm = getUChar(delta+2);
   10813       if (epartIsReg(modrm)) {
         /* Only switch to MMX mode if the source is an MMX register.
   10815             This is inconsistent with all other instructions which
   10816             convert between XMM and (M64 or MMX), which always switch
   10817             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   10818             least, that's what the Intel docs seem to me to say.
   10819             Fixes #210264. */
   10820          do_MMX_preamble();
   10821          assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
   10822          delta += 2+1;
   10823          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   10824                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10825       } else {
   10826          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10827          assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   10828          delta += 2+alen;
   10829          DIP("cvtpi2pd %s,%s\n", dis_buf,
   10830                                  nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10831       }
   10832 
   10833       putXMMRegLane64F(
   10834          gregOfRexRM(pfx,modrm), 0,
   10835          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   10836       );
   10837 
   10838       putXMMRegLane64F(
   10839          gregOfRexRM(pfx,modrm), 1,
   10840          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   10841       );
   10842 
   10843       goto decode_success;
   10844    }
   10845 
   10846    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   10847       xmm(G), rounding towards zero */
   10848    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   10849       xmm(G), as per the prevailing rounding mode */
   10850    if ( ( (have66noF2noF3(pfx) && sz == 2)
   10851           || (haveF3no66noF2(pfx) && sz == 4)
   10852         )
   10853         && insn[0] == 0x0F && insn[1] == 0x5B) {
   10854       IRTemp argV   = newTemp(Ity_V128);
   10855       IRTemp rmode  = newTemp(Ity_I32);
   10856       Bool   r2zero = toBool(sz == 4);
   10857 
   10858       modrm = getUChar(delta+2);
   10859       if (epartIsReg(modrm)) {
   10860          assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   10861          delta += 2+1;
   10862          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   10863                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10864       } else {
   10865          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10866          assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10867          delta += 2+alen;
   10868          DIP("cvtps2dq %s,%s\n", dis_buf,
   10869                                  nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10870       }
   10871 
   10872       if (r2zero) {
   10873          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10874       } else {
   10875          assign( rmode, get_sse_roundingmode() );
   10876       }
   10877 
   10878       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   10879 
   10880       /* This is less than ideal.  If it turns out to be a performance
   10881          bottleneck it can be improved. */
   10882 #     define CVT(_t)                             \
   10883          binop( Iop_F64toI32S,                   \
   10884                 mkexpr(rmode),                   \
   10885                 unop( Iop_F32toF64,              \
   10886                       unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10887 
   10888       putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
   10889       putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
   10890       putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
   10891       putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
   10892 
   10893 #     undef CVT
   10894 
   10895       goto decode_success;
   10896    }
   10897 
   10898    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   10899       F64 in xmm(G). */
   10900    if (haveNo66noF2noF3(pfx) && sz == 4
   10901        && insn[0] == 0x0F && insn[1] == 0x5A) {
   10902       IRTemp f32lo = newTemp(Ity_F32);
   10903       IRTemp f32hi = newTemp(Ity_F32);
   10904 
   10905       modrm = getUChar(delta+2);
   10906       if (epartIsReg(modrm)) {
   10907          assign( f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0) );
   10908          assign( f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1) );
   10909          delta += 2+1;
   10910          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   10911                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10912       } else {
   10913          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10914 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   10915 	 assign( f32hi, loadLE(Ity_F32,
   10916                                binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
   10917          delta += 2+alen;
   10918          DIP("cvtps2pd %s,%s\n", dis_buf,
   10919                                  nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10920       }
   10921 
   10922       putXMMRegLane64F( gregOfRexRM(pfx,modrm), 1,
   10923                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   10924       putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   10925                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   10926 
   10927       goto decode_success;
   10928    }
   10929 
   10930    /* F2 0F 2D = CVTSD2SI
   10931       when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   10932                     according to prevailing SSE rounding mode
   10933       when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   10934                     according to prevailing SSE rounding mode
   10935    */
   10936    /* F2 0F 2C = CVTTSD2SI
   10937       when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   10938                     truncating towards zero
   10939       when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   10940                     truncating towards zero
   10941    */
   10942    if (haveF2no66noF3(pfx)
   10943        && insn[0] == 0x0F
   10944        && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   10945       IRTemp rmode  = newTemp(Ity_I32);
   10946       IRTemp f64lo  = newTemp(Ity_F64);
   10947       Bool   r2zero = toBool(insn[1] == 0x2C);
   10948       vassert(sz == 4 || sz == 8);
   10949 
   10950       modrm = getUChar(delta+2);
   10951       if (epartIsReg(modrm)) {
   10952          delta += 2+1;
   10953          assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10954          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   10955                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
   10956                                    nameIReg(sz, gregOfRexRM(pfx,modrm), False));
   10957       } else {
   10958          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10959          assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10960          delta += 2+alen;
   10961          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   10962                                    dis_buf,
   10963                                    nameIReg(sz, gregOfRexRM(pfx,modrm), False));
   10964       }
   10965 
   10966       if (r2zero) {
   10967          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10968       } else {
   10969          assign( rmode, get_sse_roundingmode() );
   10970       }
   10971 
   10972       if (sz == 4) {
   10973          putIReg32( gregOfRexRM(pfx,modrm),
   10974                     binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   10975       } else {
   10976          putIReg64( gregOfRexRM(pfx,modrm),
   10977                     binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   10978       }
   10979 
   10980       goto decode_success;
   10981    }
   10982 
   10983    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   10984       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   10985    if (haveF2no66noF3(pfx) && sz == 4
   10986        && insn[0] == 0x0F && insn[1] == 0x5A) {
   10987       IRTemp rmode = newTemp(Ity_I32);
   10988       IRTemp f64lo = newTemp(Ity_F64);
   10989       vassert(sz == 4);
   10990 
   10991       modrm = getUChar(delta+2);
   10992       if (epartIsReg(modrm)) {
   10993          delta += 2+1;
   10994          assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10995          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   10996                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10997       } else {
   10998          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10999          assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   11000          delta += 2+alen;
   11001          DIP("cvtsd2ss %s,%s\n", dis_buf,
   11002                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11003       }
   11004 
   11005       assign( rmode, get_sse_roundingmode() );
   11006       putXMMRegLane32F(
   11007          gregOfRexRM(pfx,modrm), 0,
   11008          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   11009       );
   11010 
   11011       goto decode_success;
   11012    }
   11013 
   11014    /* F2 0F 2A = CVTSI2SD
   11015       when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
   11016       when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
   11017    */
   11018    if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)
   11019        && insn[0] == 0x0F && insn[1] == 0x2A) {
   11020       modrm = getUChar(delta+2);
   11021 
   11022       if (sz == 4) {
   11023          IRTemp arg32 = newTemp(Ity_I32);
   11024          if (epartIsReg(modrm)) {
   11025             assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
   11026             delta += 2+1;
   11027             DIP("cvtsi2sd %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   11028                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   11029          } else {
   11030             addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11031             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   11032             delta += 2+alen;
   11033             DIP("cvtsi2sd %s,%s\n", dis_buf,
   11034                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
   11035          }
   11036          putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   11037                            unop(Iop_I32StoF64, mkexpr(arg32))
   11038          );
   11039       } else {
   11040          /* sz == 8 */
   11041          IRTemp arg64 = newTemp(Ity_I64);
   11042          if (epartIsReg(modrm)) {
   11043             assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
   11044             delta += 2+1;
   11045             DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   11046                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   11047          } else {
   11048             addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11049             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   11050             delta += 2+alen;
   11051             DIP("cvtsi2sdq %s,%s\n", dis_buf,
   11052                                      nameXMMReg(gregOfRexRM(pfx,modrm)) );
   11053          }
   11054          putXMMRegLane64F(
   11055             gregOfRexRM(pfx,modrm),
   11056             0,
   11057             binop( Iop_I64StoF64,
   11058                    get_sse_roundingmode(),
   11059                    mkexpr(arg64)
   11060             )
   11061          );
   11062 
   11063       }
   11064 
   11065       goto decode_success;
   11066    }
   11067 
   11068    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   11069       low half xmm(G) */
   11070    if (haveF3no66noF2(pfx) && sz == 4
   11071        && insn[0] == 0x0F && insn[1] == 0x5A) {
   11072       IRTemp f32lo = newTemp(Ity_F32);
   11073 
   11074       modrm = getUChar(delta+2);
   11075       if (epartIsReg(modrm)) {
   11076          delta += 2+1;
   11077          assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   11078          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11079                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11080       } else {
   11081          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11082          assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   11083          delta += 2+alen;
   11084          DIP("cvtss2sd %s,%s\n", dis_buf,
   11085                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11086       }
   11087 
   11088       putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   11089                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   11090 
   11091       goto decode_success;
   11092    }
   11093 
   11094    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   11095    if (have66noF2noF3(pfx) && sz == 2
   11096        && insn[0] == 0x0F && insn[1] == 0x5E) {
   11097       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2 );
   11098       goto decode_success;
   11099    }
   11100 
   11101    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   11102    if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x5E) {
   11103       vassert(sz == 4);
   11104       delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2 );
   11105       goto decode_success;
   11106    }
   11107 
   11108    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   11109    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   11110    if (haveNo66noF2noF3(pfx) && sz == 4
   11111        && insn[0] == 0x0F && insn[1] == 0xAE
   11112        && epartIsReg(insn[2])
   11113        && (gregLO3ofRM(insn[2]) == 5 || gregLO3ofRM(insn[2]) == 6)) {
   11114       delta += 3;
   11115       /* Insert a memory fence.  It's sometimes important that these
   11116          are carried through to the generated code. */
   11117       stmt( IRStmt_MBE(Imbe_Fence) );
   11118       DIP("%sfence\n", gregLO3ofRM(insn[2])==5 ? "l" : "m");
   11119       goto decode_success;
   11120    }
   11121 
   11122    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   11123    if (have66noF2noF3(pfx) && sz == 2
   11124        && insn[0] == 0x0F && insn[1] == 0x5F) {
   11125       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2 );
   11126       goto decode_success;
   11127    }
   11128 
   11129    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   11130    if (haveF2no66noF3(pfx) && sz == 4
   11131        && insn[0] == 0x0F && insn[1] == 0x5F) {
   11132       delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2 );
   11133       goto decode_success;
   11134    }
   11135 
   11136    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   11137    if (have66noF2noF3(pfx) && sz == 2
   11138        && insn[0] == 0x0F && insn[1] == 0x5D) {
   11139       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2 );
   11140       goto decode_success;
   11141    }
   11142 
   11143    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   11144    if (haveF2no66noF3(pfx) && sz == 4
   11145        && insn[0] == 0x0F && insn[1] == 0x5D) {
   11146       delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2 );
   11147       goto decode_success;
   11148    }
   11149 
   11150    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   11151    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   11152    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   11153    if (have66noF2noF3(pfx)
   11154        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
   11155        && insn[0] == 0x0F
   11156        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   11157       HChar* wot = insn[1]==0x28 ? "apd" :
   11158                    insn[1]==0x10 ? "upd" : "dqa";
   11159       modrm = getUChar(delta+2);
   11160       if (epartIsReg(modrm)) {
   11161          putXMMReg( gregOfRexRM(pfx,modrm),
   11162                     getXMMReg( eregOfRexRM(pfx,modrm) ));
   11163          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRexRM(pfx,modrm)),
   11164                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   11165          delta += 2+1;
   11166       } else {
   11167          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11168          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   11169             gen_SEGV_if_not_16_aligned( addr );
   11170          putXMMReg( gregOfRexRM(pfx,modrm),
   11171                     loadLE(Ity_V128, mkexpr(addr)) );
   11172          DIP("mov%s %s,%s\n", wot, dis_buf,
   11173                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   11174          delta += 2+alen;
   11175       }
   11176       goto decode_success;
   11177    }
   11178 
   11179    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   11180    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   11181    if (have66noF2noF3(pfx) && insn[0] == 0x0F
   11182        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   11183       HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   11184       modrm = getUChar(delta+2);
   11185       if (epartIsReg(modrm)) {
   11186          putXMMReg( eregOfRexRM(pfx,modrm),
   11187 		    getXMMReg( gregOfRexRM(pfx,modrm) ) );
   11188          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
   11189 	                           nameXMMReg(eregOfRexRM(pfx,modrm)));
   11190          delta += 2+1;
   11191       } else {
   11192          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11193          if (insn[1] == 0x29/*movapd*/)
   11194             gen_SEGV_if_not_16_aligned( addr );
   11195          storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11196          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
   11197                               dis_buf );
   11198          delta += 2+alen;
   11199       }
   11200       goto decode_success;
   11201    }
   11202 
   11203    /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4, zeroing high 3/4 of xmm. */
   11204    /*              or from ireg64/m64 to xmm lo 1/2, zeroing high 1/2 of xmm. */
   11205    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x6E) {
   11206       vassert(sz == 2 || sz == 8);
   11207       if (sz == 2) sz = 4;
   11208       modrm = getUChar(delta+2);
   11209       if (epartIsReg(modrm)) {
   11210          delta += 2+1;
   11211          if (sz == 4) {
   11212             putXMMReg(
   11213                gregOfRexRM(pfx,modrm),
   11214                unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
   11215             );
   11216             DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   11217                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11218          } else {
   11219             putXMMReg(
   11220                gregOfRexRM(pfx,modrm),
   11221                unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
   11222             );
   11223             DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   11224                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11225 	 }
   11226       } else {
   11227          addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11228          delta += 2+alen;
   11229          putXMMReg(
   11230             gregOfRexRM(pfx,modrm),
   11231             sz == 4
   11232                ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   11233 	       :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
   11234          );
   11235          DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
   11236                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   11237       }
   11238       goto decode_success;
   11239    }
   11240 
   11241    /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   11242    /*              or from xmm low 1/2 to ireg64 or m64. */
   11243    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x7E) {
   11244       if (sz == 2) sz = 4;
   11245       vassert(sz == 4 || sz == 8);
   11246       modrm = getUChar(delta+2);
   11247       if (epartIsReg(modrm)) {
   11248          delta += 2+1;
   11249          if (sz == 4) {
   11250             putIReg32( eregOfRexRM(pfx,modrm),
   11251                        getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   11252             DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11253                                  nameIReg32(eregOfRexRM(pfx,modrm)));
   11254 	 } else {
   11255             putIReg64( eregOfRexRM(pfx,modrm),
   11256                        getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   11257             DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11258                                  nameIReg64(eregOfRexRM(pfx,modrm)));
   11259 	 }
   11260       } else {
   11261          addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11262          delta += 2+alen;
   11263          storeLE( mkexpr(addr),
   11264                   sz == 4
   11265                      ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   11266                      : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   11267          DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   11268                                nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   11269       }
   11270       goto decode_success;
   11271    }
   11272 
   11273    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   11274    if (have66noF2noF3(pfx) && sz == 2
   11275        && insn[0] == 0x0F && insn[1] == 0x7F) {
   11276       modrm = getUChar(delta+2);
   11277       if (epartIsReg(modrm)) {
   11278          delta += 2+1;
   11279          putXMMReg( eregOfRexRM(pfx,modrm),
   11280                     getXMMReg(gregOfRexRM(pfx,modrm)) );
   11281          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11282                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
   11283       } else {
   11284          addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11285          gen_SEGV_if_not_16_aligned( addr );
   11286          delta += 2+alen;
   11287          storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11288          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   11289       }
   11290       goto decode_success;
   11291    }
   11292 
   11293    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   11294    if (haveF3no66noF2(pfx) && sz == 4
   11295        && insn[0] == 0x0F && insn[1] == 0x6F) {
   11296       modrm = getUChar(delta+2);
   11297       if (epartIsReg(modrm)) {
   11298          putXMMReg( gregOfRexRM(pfx,modrm),
   11299                     getXMMReg( eregOfRexRM(pfx,modrm) ));
   11300          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11301                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   11302          delta += 2+1;
   11303       } else {
   11304          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11305          putXMMReg( gregOfRexRM(pfx,modrm),
   11306                     loadLE(Ity_V128, mkexpr(addr)) );
   11307          DIP("movdqu %s,%s\n", dis_buf,
   11308                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   11309          delta += 2+alen;
   11310       }
   11311       goto decode_success;
   11312    }
   11313 
   11314    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   11315    if (haveF3no66noF2(pfx) && sz == 4
   11316        && insn[0] == 0x0F && insn[1] == 0x7F) {
   11317       modrm = getUChar(delta+2);
   11318       if (epartIsReg(modrm)) {
   11319          goto decode_failure; /* awaiting test case */
   11320          delta += 2+1;
   11321          putXMMReg( eregOfRexRM(pfx,modrm),
   11322                     getXMMReg(gregOfRexRM(pfx,modrm)) );
   11323          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11324                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
   11325       } else {
   11326          addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11327          delta += 2+alen;
   11328          storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11329          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   11330       }
   11331       goto decode_success;
   11332    }
   11333 
   11334    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   11335    if (haveF2no66noF3(pfx) && sz == 4
   11336        && insn[0] == 0x0F && insn[1] == 0xD6) {
   11337       modrm = getUChar(delta+2);
   11338       if (epartIsReg(modrm)) {
   11339          do_MMX_preamble();
   11340          putMMXReg( gregLO3ofRM(modrm),
   11341                     getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11342          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11343                                 nameMMXReg(gregLO3ofRM(modrm)));
   11344          delta += 2+1;
   11345          goto decode_success;
   11346       } else {
   11347          /* apparently no mem case for this insn */
   11348          goto decode_failure;
   11349       }
   11350    }
   11351 
   11352    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   /* This seems identical to MOVHPS.  This instruction encoding is
   11354       completely crazy. */
   11355    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x16) {
   11356       modrm = getUChar(delta+2);
   11357       if (epartIsReg(modrm)) {
   11358          /* fall through; apparently reg-reg is not possible */
   11359       } else {
   11360          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11361          delta += 2+alen;
   11362          putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   11363                           loadLE(Ity_I64, mkexpr(addr)) );
   11364          DIP("movhpd %s,%s\n", dis_buf,
   11365                                nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11366          goto decode_success;
   11367       }
   11368    }
   11369 
   11370    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   11371    /* Again, this seems identical to MOVHPS. */
   11372    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x17) {
   11373       if (!epartIsReg(insn[2])) {
   11374          delta += 2;
   11375          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11376          delta += alen;
   11377          storeLE( mkexpr(addr),
   11378                   getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
   11379                                    1/*upper lane*/ ) );
   11380          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
   11381                                dis_buf);
   11382          goto decode_success;
   11383       }
   11384       /* else fall through */
   11385    }
   11386 
   11387    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   11388    /* Identical to MOVLPS ? */
   11389    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x12) {
   11390       modrm = getUChar(delta+2);
   11391       if (epartIsReg(modrm)) {
   11392          /* fall through; apparently reg-reg is not possible */
   11393       } else {
   11394          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11395          delta += 2+alen;
   11396          putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11397                           0/*lower lane*/,
   11398                           loadLE(Ity_I64, mkexpr(addr)) );
   11399          DIP("movlpd %s, %s\n",
   11400              dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11401          goto decode_success;
   11402       }
   11403    }
   11404 
   11405    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   11406    /* Identical to MOVLPS ? */
   11407    if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x13) {
   11408       modrm = getUChar(delta+2);
   11409       if (!epartIsReg(modrm)) {
   11410          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11411          delta += 2+alen;
   11412          storeLE( mkexpr(addr),
   11413                   getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11414                                    0/*lower lane*/ ) );
   11415          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11416                                 dis_buf);
   11417          goto decode_success;
   11418       }
   11419       /* else fall through */
   11420    }
   11421 
   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
      2 lowest bits of ireg(G) */
   if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:
         66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
         20071106: see further comments on MOVMSKPS implementation above.
      */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* t0 = sign bit of the low F64: bit 31 of 32-bit lane 1
            (== bit 63 of the register), shifted down to result bit 0. */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
                            mkU32(1) ));
         /* t1 = sign bit of the high F64: bit 31 of 32-bit lane 3
            (== bit 127 of the register), shifted by 30 so it lands in
            result bit 1. */
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
                            mkU32(2) ));
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
                  );
         DIP("movmskpd %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
      goto decode_failure;
   }
   11454 
   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         IRTemp regD    = newTemp(Ity_V128);
         IRTemp mask    = newTemp(Ity_V128);
         IRTemp olddata = newTemp(Ity_V128);
         IRTemp newdata = newTemp(Ity_V128);
                addr    = newTemp(Ity_I64);

         /* The implicit memory operand is [RDI], subject to segment /
            address-size overrides. */
         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regD, getXMMReg( gregOfRexRM(pfx,modrm) ));

         /* Unfortunately can't do the obvious thing with SarN8x16
            here since that can't be re-emitted as SSE2 code - no such
            insn. */
         /* Build a per-byte mask (0x00 or 0xFF) by arithmetically
            shifting each byte of E right by 7, one 64-bit half at a
            time. */
         assign(
            mask,
            binop(Iop_64HLtoV128,
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
                        mkU8(7) ),
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
                        mkU8(7) ) ));
         /* NOTE(review): modelled as a read-modify-write of all 16
            bytes at [RDI], even for bytes whose mask bit is clear. */
         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_OrV128,
                       binop(Iop_AndV128,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_AndV128,
                             mkexpr(olddata),
                             unop(Iop_NotV128, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );

         delta += 2+1;
         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRexRM(pfx,modrm) ),
                                   nameXMMReg( gregOfRexRM(pfx,modrm) ) );
         goto decode_success;
      }
      /* else fall through */
   }
   11499 
   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store.  The
      non-temporal cache hint is ignored. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE7) {
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* MOVNTDQ requires a 16-aligned destination. */
         gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         /* NOTE(review): DIP prints operands as mem,reg though the
            data flow is reg->mem; harmless debug-output quirk. */
         DIP("movntdq %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
      goto decode_failure;
   }

   /* 0F C3 = MOVNTI -- for us, just a plain ireg store.  The
      non-temporal hint is ignored. */
   if (haveNo66noF2noF3(pfx) &&
       insn[0] == 0x0F && insn[1] == 0xC3) {
      vassert(sz == 4 || sz == 8);
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
         DIP("movnti %s,%s\n", dis_buf,
                               nameIRegG(sz, pfx, modrm));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }

   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm).  */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, awaiting test case */
         /* dst: lo half copied, hi half zeroed */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
   11551 
   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
      hi half). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Done before any MMX register access, as in the other
            MMX-touching arms. */
         do_MMX_preamble();
         /* Iop_64UtoV128 zero-extends, so bits 127:64 of G are
            cleared as required. */
         putXMMReg( gregOfRexRM(pfx,modrm),
                    unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
         DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
         goto decode_success;
      } else {
         /* apparently no mem case for this insn */
         goto decode_failure;
      }
   }
   11570 
   11571    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   11572       G (lo half xmm).  Upper half of G is zeroed out. */
   11573    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   11574       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   11575       If E is reg, upper half of G is unchanged. */
   11576    if ( (haveF2no66noF3(pfx)
   11577          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   11578          && insn[0] == 0x0F && insn[1] == 0x10)
   11579         ||
   11580         (haveF3no66noF2(pfx)
   11581          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   11582          && insn[0] == 0x0F && insn[1] == 0x7E)
   11583       ) {
   11584       modrm = getUChar(delta+2);
   11585       if (epartIsReg(modrm)) {
   11586          putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11587                           getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11588          if (insn[1] == 0x7E/*MOVQ*/) {
   11589             /* zero bits 127:64 */
   11590             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   11591          }
   11592          DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11593                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   11594          delta += 2+1;
   11595       } else {
   11596          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11597          putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11598          putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11599                           loadLE(Ity_I64, mkexpr(addr)) );
   11600          DIP("movsd %s,%s\n", dis_buf,
   11601                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   11602          delta += 2+alen;
   11603       }
   11604       goto decode_success;
   11605    }
   11606 
   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm). */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg: only the low lane of E is written; its upper lane
            is left unchanged. */
         putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              nameXMMReg(eregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
      }
      goto decode_success;
   }
   11629 
   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2 );
      goto decode_success;
   }

   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2 );
      goto decode_success;
   }

   /* 66 0F 56 = ORPD -- G = G or E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128 );
      goto decode_success;
   }
   11652 
   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV = newTemp(Ity_V128);
      IRTemp dV = newTemp(Ity_V128);
      IRTemp s1 = newTemp(Ity_I64);
      IRTemp s0 = newTemp(Ity_I64);
      IRTemp d1 = newTemp(Ity_I64);
      IRTemp d0 = newTemp(Ity_I64);

      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];       /* imm8 follows the modrm byte */
         delta += 2+2;
         DIP("shufpd $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];  /* imm8 follows the amode */
         delta += 3+alen;
         DIP("shufpd $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Split both operands into 64-bit halves. */
      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );

#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
#     define SELS(n) mkexpr((n)==0 ? s0 : s1)

      /* imm8 bit 0 selects which half of dst becomes the result's low
         lane; imm8 bit 1 selects which half of src becomes the high
         lane. */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
   11702 
   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   /* (unary op: E is the only source; G is only the destination) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "sqrtpd", Iop_Sqrt64Fx2 );
      goto decode_success;
   }
   11710 
   11711    /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   11712    if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x51) {
   11713       vassert(sz == 4);
   11714       delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta+2,
   11715                                          "sqrtsd", Iop_Sqrt64F0x2 );
   11716       goto decode_success;
   11717    }
   11718 
   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   /* (the _lo64 helper operates only on the low 64-bit lane; cf.
      MULSD above) */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2 );
      goto decode_success;
   }
   11733 
   11734    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   11735    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   11736    /* These just appear to be special cases of SHUFPS */
   11737    if (have66noF2noF3(pfx)
   11738        && sz == 2 /* could be 8 if rex also present */
   11739        && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   11740       IRTemp s1 = newTemp(Ity_I64);
   11741       IRTemp s0 = newTemp(Ity_I64);
   11742       IRTemp d1 = newTemp(Ity_I64);
   11743       IRTemp d0 = newTemp(Ity_I64);
   11744       IRTemp sV = newTemp(Ity_V128);
   11745       IRTemp dV = newTemp(Ity_V128);
   11746       Bool   hi = toBool(insn[1] == 0x15);
   11747 
   11748       modrm = insn[2];
   11749       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   11750 
   11751       if (epartIsReg(modrm)) {
   11752          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   11753          delta += 2+1;
   11754          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11755                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   11756                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11757       } else {
   11758          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11759          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11760          delta += 2+alen;
   11761          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11762                                   dis_buf,
   11763                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11764       }
   11765 
   11766       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11767       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11768       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11769       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11770 
   11771       if (hi) {
   11772          putXMMReg( gregOfRexRM(pfx,modrm),
   11773                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   11774       } else {
   11775          putXMMReg( gregOfRexRM(pfx,modrm),
   11776                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   11777       }
   11778 
   11779       goto decode_success;
   11780    }
   11781 
   /* 66 0F 57 = XORPD -- G = G xor E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128 );
      goto decode_success;
   }

   /* 66 0F 6B = PACKSSDW */
   /* NOTE(review): the final Bool arg of dis_SSEint_E_to_G is True for
      the PACK* narrows below but False for the plain arithmetic ops;
      presumably it selects operand ordering for non-commutative ops --
      confirm against the helper's definition. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6B) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packssdw", Iop_QNarrow32Sx4, True );
      goto decode_success;
   }

   /* 66 0F 63 = PACKSSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x63) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packsswb", Iop_QNarrow16Sx8, True );
      goto decode_success;
   }

   /* 66 0F 67 = PACKUSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x67) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packuswb", Iop_QNarrow16Ux8, True );
      goto decode_success;
   }

   /* 66 0F FC = PADDB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddb", Iop_Add8x16, False );
      goto decode_success;
   }

   /* 66 0F FE = PADDD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddd", Iop_Add32x4, False );
      goto decode_success;
   }
   11828 
   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F D4 = PADDQ -- add 64x1 */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      /* MMX form: enter MMX mode before touching MMX state. */
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                vbi, pfx, delta+2, insn[1], "paddq", False );
      goto decode_success;
   }

   /* 66 0F D4 = PADDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddq", Iop_Add64x2, False );
      goto decode_success;
   }

   /* 66 0F FD = PADDW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddw", Iop_Add16x8, False );
      goto decode_success;
   }

   /* 66 0F EC = PADDSB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsb", Iop_QAdd8Sx16, False );
      goto decode_success;
   }

   /* 66 0F ED = PADDSW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xED) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsw", Iop_QAdd16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DC = PADDUSB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusb", Iop_QAdd8Ux16, False );
      goto decode_success;
   }

   /* 66 0F DD = PADDUSW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusw", Iop_QAdd16Ux8, False );
      goto decode_success;
   }

   /* 66 0F DB = PAND */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDB) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F DF = PANDN */
   /* NOTE(review): the _invG helper name suggests G is complemented
      before the AND, i.e. G = ~G & E, matching PANDN semantics --
      confirm against the helper's definition. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDF) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128 );
      goto decode_success;
   }
   11899    }
   11900 
   11901    /* 66 0F E0 = PAVGB */
   11902    if (have66noF2noF3(pfx) && sz == 2
   11903        && insn[0] == 0x0F && insn[1] == 0xE0) {
   11904       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11905                                  "pavgb", Iop_Avg8Ux16, False );
   11906       goto decode_success;
   11907    }
   11908 
   11909    /* 66 0F E3 = PAVGW */
   11910    if (have66noF2noF3(pfx) && sz == 2
   11911        && insn[0] == 0x0F && insn[1] == 0xE3) {
   11912       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11913                                  "pavgw", Iop_Avg16Ux8, False );
   11914       goto decode_success;
   11915    }
   11916 
   11917    /* 66 0F 74 = PCMPEQB */
   11918    if (have66noF2noF3(pfx) && sz == 2
   11919        && insn[0] == 0x0F && insn[1] == 0x74) {
   11920       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11921                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   11922       goto decode_success;
   11923    }
   11924 
   11925    /* 66 0F 76 = PCMPEQD */
   11926    if (have66noF2noF3(pfx) && sz == 2
   11927        && insn[0] == 0x0F && insn[1] == 0x76) {
   11928       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11929                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   11930       goto decode_success;
   11931    }
   11932 
   11933    /* 66 0F 75 = PCMPEQW */
   11934    if (have66noF2noF3(pfx) && sz == 2
   11935        && insn[0] == 0x0F && insn[1] == 0x75) {
   11936       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11937                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   11938       goto decode_success;
   11939    }
   11940 
   11941    /* 66 0F 64 = PCMPGTB */
   11942    if (have66noF2noF3(pfx) && sz == 2
   11943        && insn[0] == 0x0F && insn[1] == 0x64) {
   11944       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11945                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   11946       goto decode_success;
   11947    }
   11948 
   11949    /* 66 0F 66 = PCMPGTD */
   11950    if (have66noF2noF3(pfx) && sz == 2
   11951        && insn[0] == 0x0F && insn[1] == 0x66) {
   11952       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11953                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   11954       goto decode_success;
   11955    }
   11956 
   11957    /* 66 0F 65 = PCMPGTW */
   11958    if (have66noF2noF3(pfx) && sz == 2
   11959        && insn[0] == 0x0F && insn[1] == 0x65) {
   11960       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   11961                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   11962       goto decode_success;
   11963    }
   11964 
   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
      zero-extend of it in ireg(G). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         t5 = newTemp(Ity_V128);
         t4 = newTemp(Ity_I16);
         assign(t5, getXMMReg(eregOfRexRM(pfx,modrm)));
         /* Split E into four 32-bit chunks t3:t2:t1:t0, then pick the
            16-bit lane selected by the low 3 bits of the imm8
            (insn[3], the byte after modrm). */
         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
         switch (insn[3] & 7) {
            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
            default: vassert(0);   /* unreachable: (x & 7) is 0..7 */
         }
         /* Result is zero-extended into the 32-bit ireg. */
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t4)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameXMMReg(eregOfRexRM(pfx,modrm)),
                           nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 4;
         goto decode_success;
      }
      /* else fall through */
      /* note, if memory case is ever filled in, there is 1 byte after
         amode */
   }
   11998 
   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of xmm(G). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC4) {
      Int lane;
      t4 = newTemp(Ity_I16);
      modrm = insn[2];

      if (epartIsReg(modrm)) {
         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
         delta += 3+1;
         lane = insn[3+1-1];   /* == insn[3]: imm8 follows the modrm byte */
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg16(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         delta += 3+alen;
         lane = insn[3+alen-1];   /* imm8 is the byte after the amode */
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
     }

      /* Only the low 3 bits of the imm8 select the target lane. */
      putXMMRegLane16( gregOfRexRM(pfx,modrm), lane & 7, mkexpr(t4) );
      goto decode_success;
   }
   12029 
   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
      E(xmm or mem) to G(xmm) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF5) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmaddwd %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
      /* Compute as two independent 64-bit halves, each handled by a
         clean-helper call to the MMX pmaddwd routine. */
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_pmaddwd",
                      &amd64g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_pmaddwd",
                      &amd64g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
      goto decode_success;
   }
   12077 
   /* SSE2 packed min/max family: all four forms reduce to a single
      lane-wise IR op, so each is handed off to the common
      E(xmm/mem)-to-G(xmm) integer-op decoder. */

   /* 66 0F EE = PMAXSW -- 16x8 signed max */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmaxsw", Iop_Max16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmaxub", Iop_Max8Ux16, False );
      goto decode_success;
   }

   /* 66 0F EA = PMINSW -- 16x8 signed min */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pminsw", Iop_Min16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pminub", Iop_Min8Ux16, False );
      goto decode_success;
   }
   12109 
   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
      xmm(E), turn them into a byte, and put zero-extend of it in
      ireg(G).  Doing this directly is just too cumbersome; give up
      therefore and call a helper. */
   /* ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      /* Only the register-register form is handled; a memory E operand
         falls through (to whatever decoding follows this point). */
      if (epartIsReg(modrm)) {
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         /* t0 = low 64 bits of E, t1 = high 64 bits of E. */
         assign(t0, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0));
         assign(t1, getXMMRegLane64(eregOfRexRM(pfx,modrm), 1));
         t5 = newTemp(Ity_I64);
         /* Helper computes the 16-bit byte mask; hi half first. */
         assign(t5, mkIRExprCCall(
                       Ity_I64, 0/*regparms*/,
                       "amd64g_calculate_sse_pmovmskb",
                       &amd64g_calculate_sse_pmovmskb,
                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
         /* putIReg32 zero-extends into the full 64-bit G register. */
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t5)));
         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }
   12138 
   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE4) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmulhuw", Iop_MulHi16Ux8, False );
      goto decode_success;
   }

   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE5) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmulhw", Iop_MulHi16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D5 = PMULLW -- 16x8 low-half multiply */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD5) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmullw", Iop_Mul16x8, False );
      goto decode_success;
   }
   12162 
   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form 64-bit result */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV = newTemp(Ity_I64);
      IRTemp dV = newTemp(Ity_I64);
      t1 = newTemp(Ity_I32);
      t0 = newTemp(Ity_I32);
      modrm = insn[2];

      /* MMX-register form: ensure x87/MMX state is set up first. */
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Take the low 32 bits of each operand; Iop_MullU32 is a
         widening 32x32 -> 64 unsigned multiply, giving the full
         64-bit result directly. */
      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
      putMMXReg( gregLO3ofRM(modrm),
                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
      goto decode_success;
   }
   12196 
   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
      half */
   /* This is a really poor translation -- could be improved if
      performance critical */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      t1 = newTemp(Ity_I64);
      t0 = newTemp(Ity_I64);
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Split both operands into four 32-bit lanes (3 = highest). */
      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Widening multiplies: lane0 x lane0 -> result bits 63:0,
         lane2 x lane2 -> result bits 127:64.  Lanes 1 and 3 are
         ignored, as the architecture specifies. */
      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
      goto decode_success;
   }
   12236 
   /* 66 0F EB = POR -- bitwise OR of the full 128 bits */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEB) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128 );
      goto decode_success;
   }
   12243 
   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
      from E(xmm or mem) to G(xmm) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF6) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("psadbw %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
      /* Split both operands into 64-bit halves and reuse the MMX
         (64-bit) psadbw helper on each half independently. */
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_psadbw",
                      &amd64g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_psadbw",
                      &amd64g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      /* Reassemble the two 64-bit partial results into G. */
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
      goto decode_success;
   }
   12291 
   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         /* imm8 selector directly follows the modrm byte. */
         order = (Int)insn[3];
         delta += 3+1;
         DIP("pshufd $%d,%s,%s\n", order,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         /* imm8 selector follows the (variable-length) amode. */
         order = (Int)insn[2+alen];
         delta += 2+alen+1;
         DIP("pshufd $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* SEL(n) names source lane n; each 2-bit field of 'order'
         selects the source lane for one destination lane, highest
         field -> highest lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
             mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                           SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
   12330 
   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
      mem) to G(xmm), and copy lower half */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV   = newTemp(Ity_V128);
      dV   = newTemp(Ity_V128);
      sVhi = newTemp(Ity_I64);
      dVhi = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         /* imm8 selector directly follows the modrm byte. */
         order = (Int)insn[3];
         delta += 3+1;
         DIP("pshufhw $%d,%s,%s\n", order,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         /* imm8 selector follows the (variable-length) amode. */
         order = (Int)insn[2+alen];
         delta += 2+alen+1;
         DIP("pshufhw $%d,%s,%s\n", order,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      /* Only the upper 64 bits (four 16-bit lanes) are shuffled. */
      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );

      /* SEL(n) names upper-half source lane n; each 2-bit field of
         'order' selects one destination lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVhi,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      /* Lower 64 bits are copied through unchanged. */
      assign(dV, binop( Iop_64HLtoV128,
                        mkexpr(dVhi),
                        unop(Iop_V128to64, mkexpr(sV))) );
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
   12376 
   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
      mem) to G(xmm), and copy upper half */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV   = newTemp(Ity_V128);
      dV   = newTemp(Ity_V128);
      sVlo = newTemp(Ity_I64);
      dVlo = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         /* imm8 selector directly follows the modrm byte. */
         order = (Int)insn[3];
         delta += 3+1;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         /* imm8 selector follows the (variable-length) amode. */
         order = (Int)insn[2+alen];
         delta += 2+alen+1;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      /* Only the lower 64 bits (four 16-bit lanes) are shuffled. */
      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );

      /* SEL(n) names lower-half source lane n; each 2-bit field of
         'order' selects one destination lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dVlo,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      /* Upper 64 bits are copied through unchanged. */
      assign(dV, binop( Iop_64HLtoV128,
                        unop(Iop_V128HIto64, mkexpr(sV)),
                        mkexpr(dVlo) ) );
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
   12422 
   /* 66 0F 72 /6 ib = PSLLD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }

   /* 66 0F F2 = PSLLD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF2) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }
   12438 
   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   /* note, if mem case ever filled in, 1 byte after amode */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 7) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRexRM(pfx,insn[2]);
      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by 16 or more bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      /* Express the 128-bit byte-shift as operations on the two
         64-bit halves; hi64r:lo64r is the shifted result. */
      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      if (imm == 0) {
         /* No shift: pass both halves through. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: low half moves up wholesale. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, mkexpr(lo64) );
      }
      else
      if (imm > 8) {
         /* More than a half: only part of the old low half survives,
            shifted into the high half. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves left and carry the bytes
            that cross the 64-bit boundary from lo into hi. */
         assign( lo64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8(8 * imm) ));
         assign( hi64r,
                 binop( Iop_Or64,
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }
      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
   12500 
   /* 66 0F 73 /6 ib = PSLLQ by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F F3 = PSLLQ by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF3) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /6 ib = PSLLW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }

   /* 66 0F F1 = PSLLW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }
   12532 
   /* 66 0F 72 /4 ib = PSRAD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 4) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F E2 = PSRAD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE2) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F 71 /4 ib = PSRAW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 4) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }

   /* 66 0F E1 = PSRAW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }
   12564 
   /* 66 0F 72 /2 ib = PSRLD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }

   /* 66 0F D2 = PSRLD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD2) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }
   12580 
   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   /* note, if mem case ever filled in, 1 byte after amode */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 3) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRexRM(pfx,insn[2]);
      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by 16 or more bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      /* Mirror-image of the PSLLDQ decomposition: express the
         128-bit right byte-shift via the two 64-bit halves. */
      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      if (imm == 0) {
         /* No shift: pass both halves through. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: high half moves down wholesale. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, mkexpr(hi64) );
      }
      else
      if (imm > 8) {
         /* More than a half: only part of the old high half survives,
            shifted into the low half. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves right and carry the bytes
            that cross the 64-bit boundary from hi into lo. */
         assign( hi64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8(8 * imm) ));
         assign( lo64r,
                 binop( Iop_Or64,
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }

      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
   12643 
   /* 66 0F 73 /2 ib = PSRLQ by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F D3 = PSRLQ by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD3) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /2 ib = PSRLW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }

   /* 66 0F D1 = PSRLW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }
   12675 
   /* 66 0F F8 = PSUBB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubb", Iop_Sub8x16, False );
      goto decode_success;
   }

   /* 66 0F FA = PSUBD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubd", Iop_Sub32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F FB = PSUBQ -- sub 64x1 */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xFB) {
      /* MMX form: set up x87/MMX state before touching MMX regs. */
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                vbi, pfx, delta+2, insn[1], "psubq", False );
      goto decode_success;
   }

   /* 66 0F FB = PSUBQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFB) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubq", Iop_Sub64x2, False );
      goto decode_success;
   }

   /* 66 0F F9 = PSUBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubw", Iop_Sub16x8, False );
      goto decode_success;
   }
   12717 
   /* 66 0F E8 = PSUBSB -- 8x16 signed saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubsb", Iop_QSub8Sx16, False );
      goto decode_success;
   }

   /* 66 0F E9 = PSUBSW -- 16x8 signed saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubsw", Iop_QSub16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D8 = PSUBUSB -- 8x16 unsigned saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubusb", Iop_QSub8Ux16, False );
      goto decode_success;
   }

   /* 66 0F D9 = PSUBUSW -- 16x8 unsigned saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubusw", Iop_QSub16Ux8, False );
      goto decode_success;
   }
   12749 
   /* PUNPCK family: interleave lanes from the two operands.  The
      final 'True' argument tells the common decoder these are the
      swapped-operand (interleave) forms. */

   /* 66 0F 68 = PUNPCKHBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x68) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhbw",
                                 Iop_InterleaveHI8x16, True );
      goto decode_success;
   }

   /* 66 0F 6A = PUNPCKHDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6A) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhdq",
                                 Iop_InterleaveHI32x4, True );
      goto decode_success;
   }

   /* 66 0F 6D = PUNPCKHQDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6D) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhqdq",
                                 Iop_InterleaveHI64x2, True );
      goto decode_success;
   }

   /* 66 0F 69 = PUNPCKHWD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x69) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhwd",
                                 Iop_InterleaveHI16x8, True );
      goto decode_success;
   }

   /* 66 0F 60 = PUNPCKLBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x60) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklbw",
                                 Iop_InterleaveLO8x16, True );
      goto decode_success;
   }

   /* 66 0F 62 = PUNPCKLDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x62) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckldq",
                                 Iop_InterleaveLO32x4, True );
      goto decode_success;
   }

   /* 66 0F 6C = PUNPCKLQDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6C) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklqdq",
                                 Iop_InterleaveLO64x2, True );
      goto decode_success;
   }

   /* 66 0F 61 = PUNPCKLWD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x61) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklwd",
                                 Iop_InterleaveLO16x8, True );
      goto decode_success;
   }
   12821 
   12822    /* 66 0F EF = PXOR */
   12823    if (have66noF2noF3(pfx) && sz == 2
   12824        && insn[0] == 0x0F && insn[1] == 0xEF) {
   12825       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128 );
   12826       goto decode_success;
   12827    }
   12828 
   12829 //.. //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   12830 //.. //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   12831 //.. //--        && (!epartIsReg(insn[2]))
   12832 //.. //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   12833 //.. //--       Bool store = gregOfRM(insn[2]) == 0;
   12834 //.. //--       vg_assert(sz == 4);
   12835 //.. //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   12836 //.. //--       t1   = LOW24(pair);
   12837 //.. //--       eip += 2+HI8(pair);
   12838 //.. //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   12839 //.. //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   12840 //.. //--                   Lit16, (UShort)insn[2],
   12841 //.. //--                   TempReg, t1 );
   12842 //.. //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   12843 //.. //--       goto decode_success;
   12844 //.. //--    }
   12845 
   12846    /* 0F AE /7 = CLFLUSH -- flush cache line */
   12847    if (haveNo66noF2noF3(pfx) && sz == 4
   12848        && insn[0] == 0x0F && insn[1] == 0xAE
   12849        && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7) {
   12850 
   12851       /* This is something of a hack.  We need to know the size of the
   12852          cache line containing addr.  Since we don't (easily), assume
   12853          256 on the basis that no real cache would have a line that
   12854          big.  It's safe to invalidate more stuff than we need, just
   12855          inefficient. */
   12856       ULong lineszB = 256ULL;
   12857 
   12858       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12859       delta += 2+alen;
   12860 
   12861       /* Round addr down to the start of the containing block. */
   12862       stmt( IRStmt_Put(
   12863                OFFB_TISTART,
   12864                binop( Iop_And64,
   12865                       mkexpr(addr),
   12866                       mkU64( ~(lineszB-1) ))) );
   12867 
   12868       stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );
   12869 
   12870       irsb->jumpkind = Ijk_TInval;
   12871       irsb->next     = mkU64(guest_RIP_bbstart+delta);
   12872       dres.whatNext  = Dis_StopHere;
   12873 
   12874       DIP("clflush %s\n", dis_buf);
   12875       goto decode_success;
   12876    }
   12877 
   12878    /* ---------------------------------------------------- */
   12879    /* --- end of the SSE/SSE2 decoder.                 --- */
   12880    /* ---------------------------------------------------- */
   12881 
   12882    /* ---------------------------------------------------- */
   12883    /* --- start of the SSE3 decoder.                   --- */
   12884    /* ---------------------------------------------------- */
   12885 
   12886    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   12887       duplicating some lanes (2:2:0:0). */
   12888    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   12889       duplicating some lanes (3:3:1:1). */
   12890    if (haveF3no66noF2(pfx) && sz == 4
   12891        && insn[0] == 0x0F && (insn[1] == 0x12 || insn[1] == 0x16)) {
   12892       IRTemp s3, s2, s1, s0;
   12893       IRTemp sV  = newTemp(Ity_V128);
   12894       Bool   isH = insn[1] == 0x16;
   12895       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12896 
   12897       modrm = insn[2];
   12898       if (epartIsReg(modrm)) {
   12899          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12900          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   12901                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   12902                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12903          delta += 2+1;
   12904       } else {
   12905          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12906          gen_SEGV_if_not_16_aligned( addr );
   12907          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12908          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   12909 	     dis_buf,
   12910              nameXMMReg(gregOfRexRM(pfx,modrm)));
   12911          delta += 2+alen;
   12912       }
   12913 
   12914       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   12915       putXMMReg( gregOfRexRM(pfx,modrm),
   12916                  isH ? mk128from32s( s3, s3, s1, s1 )
   12917                      : mk128from32s( s2, s2, s0, s0 ) );
   12918       goto decode_success;
   12919    }
   12920 
   12921    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   12922       duplicating some lanes (1:0:1:0). */
   12923    if (haveF2no66noF3(pfx)
   12924        && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   12925        && insn[0] == 0x0F && insn[1] == 0x12) {
   12926       IRTemp sV = newTemp(Ity_V128);
   12927       IRTemp d0 = newTemp(Ity_I64);
   12928 
   12929       modrm = insn[2];
   12930       if (epartIsReg(modrm)) {
   12931          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12932          DIP("movddup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12933                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12934          delta += 2+1;
   12935          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   12936       } else {
   12937          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12938          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   12939          DIP("movddup %s,%s\n", dis_buf,
   12940                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12941          delta += 2+alen;
   12942       }
   12943 
   12944       putXMMReg( gregOfRexRM(pfx,modrm),
   12945                  binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   12946       goto decode_success;
   12947    }
   12948 
   12949    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   12950    if (haveF2no66noF3(pfx) && sz == 4
   12951        && insn[0] == 0x0F && insn[1] == 0xD0) {
   12952       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   12953       IRTemp eV   = newTemp(Ity_V128);
   12954       IRTemp gV   = newTemp(Ity_V128);
   12955       IRTemp addV = newTemp(Ity_V128);
   12956       IRTemp subV = newTemp(Ity_V128);
   12957       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12958 
   12959       modrm = insn[2];
   12960       if (epartIsReg(modrm)) {
   12961          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12962          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12963                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12964          delta += 2+1;
   12965       } else {
   12966          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12967          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   12968          DIP("addsubps %s,%s\n", dis_buf,
   12969                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12970          delta += 2+alen;
   12971       }
   12972 
   12973       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   12974 
   12975       assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
   12976       assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
   12977 
   12978       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   12979       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   12980 
   12981       putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( a3, s2, a1, s0 ));
   12982       goto decode_success;
   12983    }
   12984 
   12985    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   12986    if (have66noF2noF3(pfx) && sz == 2
   12987        && insn[0] == 0x0F && insn[1] == 0xD0) {
   12988       IRTemp eV   = newTemp(Ity_V128);
   12989       IRTemp gV   = newTemp(Ity_V128);
   12990       IRTemp addV = newTemp(Ity_V128);
   12991       IRTemp subV = newTemp(Ity_V128);
   12992       IRTemp a1     = newTemp(Ity_I64);
   12993       IRTemp s0     = newTemp(Ity_I64);
   12994 
   12995       modrm = insn[2];
   12996       if (epartIsReg(modrm)) {
   12997          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12998          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12999                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13000          delta += 2+1;
   13001       } else {
   13002          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13003          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13004          DIP("addsubpd %s,%s\n", dis_buf,
   13005                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13006          delta += 2+alen;
   13007       }
   13008 
   13009       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13010 
   13011       assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
   13012       assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
   13013 
   13014       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   13015       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   13016 
   13017       putXMMReg( gregOfRexRM(pfx,modrm),
   13018                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   13019       goto decode_success;
   13020    }
   13021 
   13022    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   13023    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   13024    if (haveF2no66noF3(pfx) && sz == 4
   13025        && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   13026       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   13027       IRTemp eV     = newTemp(Ity_V128);
   13028       IRTemp gV     = newTemp(Ity_V128);
   13029       IRTemp leftV  = newTemp(Ity_V128);
   13030       IRTemp rightV = newTemp(Ity_V128);
   13031       Bool   isAdd  = insn[1] == 0x7C;
   13032       HChar* str    = isAdd ? "add" : "sub";
   13033       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   13034 
   13035       modrm = insn[2];
   13036       if (epartIsReg(modrm)) {
   13037          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13038          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13039                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13040          delta += 2+1;
   13041       } else {
   13042          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13043          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13044          DIP("h%sps %s,%s\n", str, dis_buf,
   13045                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13046          delta += 2+alen;
   13047       }
   13048 
   13049       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13050 
   13051       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   13052       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   13053 
   13054       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   13055       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   13056 
   13057       putXMMReg( gregOfRexRM(pfx,modrm),
   13058                  binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   13059                        mkexpr(leftV), mkexpr(rightV) ) );
   13060       goto decode_success;
   13061    }
   13062 
   13063    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   13064    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   13065    if (have66noF2noF3(pfx) && sz == 2
   13066        && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   13067       IRTemp e1     = newTemp(Ity_I64);
   13068       IRTemp e0     = newTemp(Ity_I64);
   13069       IRTemp g1     = newTemp(Ity_I64);
   13070       IRTemp g0     = newTemp(Ity_I64);
   13071       IRTemp eV     = newTemp(Ity_V128);
   13072       IRTemp gV     = newTemp(Ity_V128);
   13073       IRTemp leftV  = newTemp(Ity_V128);
   13074       IRTemp rightV = newTemp(Ity_V128);
   13075       Bool   isAdd  = insn[1] == 0x7C;
   13076       HChar* str    = isAdd ? "add" : "sub";
   13077 
   13078       modrm = insn[2];
   13079       if (epartIsReg(modrm)) {
   13080          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13081          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13082                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13083          delta += 2+1;
   13084       } else {
   13085          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13086          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13087          DIP("h%spd %s,%s\n", str, dis_buf,
   13088                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   13089          delta += 2+alen;
   13090       }
   13091 
   13092       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13093 
   13094       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   13095       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   13096       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   13097       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   13098 
   13099       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   13100       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   13101 
   13102       putXMMReg( gregOfRexRM(pfx,modrm),
   13103                  binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   13104                        mkexpr(leftV), mkexpr(rightV) ) );
   13105       goto decode_success;
   13106    }
   13107 
   13108    /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   13109    if (haveF2no66noF3(pfx) && sz == 4
   13110        && insn[0] == 0x0F && insn[1] == 0xF0) {
   13111       modrm = insn[2];
   13112       if (epartIsReg(modrm)) {
   13113          goto decode_failure;
   13114       } else {
   13115          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13116          putXMMReg( gregOfRexRM(pfx,modrm),
   13117                     loadLE(Ity_V128, mkexpr(addr)) );
   13118          DIP("lddqu %s,%s\n", dis_buf,
   13119                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   13120          delta += 2+alen;
   13121       }
   13122       goto decode_success;
   13123    }
   13124 
   13125    /* ---------------------------------------------------- */
   13126    /* --- end of the SSE3 decoder.                     --- */
   13127    /* ---------------------------------------------------- */
   13128 
   13129    /* ---------------------------------------------------- */
   13130    /* --- start of the SSSE3 decoder.                  --- */
   13131    /* ---------------------------------------------------- */
   13132 
   13133    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   13134       Unsigned Bytes (MMX) */
   13135    if (haveNo66noF2noF3(pfx)
   13136        && sz == 4
   13137        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   13138       IRTemp sV        = newTemp(Ity_I64);
   13139       IRTemp dV        = newTemp(Ity_I64);
   13140       IRTemp sVoddsSX  = newTemp(Ity_I64);
   13141       IRTemp sVevensSX = newTemp(Ity_I64);
   13142       IRTemp dVoddsZX  = newTemp(Ity_I64);
   13143       IRTemp dVevensZX = newTemp(Ity_I64);
   13144 
   13145       modrm = insn[3];
   13146       do_MMX_preamble();
   13147       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13148 
   13149       if (epartIsReg(modrm)) {
   13150          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13151          delta += 3+1;
   13152          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   13153                                   nameMMXReg(gregLO3ofRM(modrm)));
   13154       } else {
   13155          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13156          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13157          delta += 3+alen;
   13158          DIP("pmaddubsw %s,%s\n", dis_buf,
   13159                                   nameMMXReg(gregLO3ofRM(modrm)));
   13160       }
   13161 
   13162       /* compute dV unsigned x sV signed */
   13163       assign( sVoddsSX,
   13164               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   13165       assign( sVevensSX,
   13166               binop(Iop_SarN16x4,
   13167                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   13168                     mkU8(8)) );
   13169       assign( dVoddsZX,
   13170               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   13171       assign( dVevensZX,
   13172               binop(Iop_ShrN16x4,
   13173                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   13174                     mkU8(8)) );
   13175 
   13176       putMMXReg(
   13177          gregLO3ofRM(modrm),
   13178          binop(Iop_QAdd16Sx4,
   13179                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   13180                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   13181          )
   13182       );
   13183       goto decode_success;
   13184    }
   13185 
   13186    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   13187       Unsigned Bytes (XMM) */
   13188    if (have66noF2noF3(pfx)
   13189        && (sz == 2 || /*redundant REX.W*/ sz == 8)
   13190        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   13191       IRTemp sV        = newTemp(Ity_V128);
   13192       IRTemp dV        = newTemp(Ity_V128);
   13193       IRTemp sVoddsSX  = newTemp(Ity_V128);
   13194       IRTemp sVevensSX = newTemp(Ity_V128);
   13195       IRTemp dVoddsZX  = newTemp(Ity_V128);
   13196       IRTemp dVevensZX = newTemp(Ity_V128);
   13197 
   13198       modrm = insn[3];
   13199       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13200 
   13201       if (epartIsReg(modrm)) {
   13202          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   13203          delta += 3+1;
   13204          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13205                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13206       } else {
   13207          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13208          gen_SEGV_if_not_16_aligned( addr );
   13209          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13210          delta += 3+alen;
   13211          DIP("pmaddubsw %s,%s\n", dis_buf,
   13212                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13213       }
   13214 
   13215       /* compute dV unsigned x sV signed */
   13216       assign( sVoddsSX,
   13217               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   13218       assign( sVevensSX,
   13219               binop(Iop_SarN16x8,
   13220                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   13221                     mkU8(8)) );
   13222       assign( dVoddsZX,
   13223               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   13224       assign( dVevensZX,
   13225               binop(Iop_ShrN16x8,
   13226                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   13227                     mkU8(8)) );
   13228 
   13229       putXMMReg(
   13230          gregOfRexRM(pfx,modrm),
   13231          binop(Iop_QAdd16Sx8,
   13232                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   13233                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   13234          )
   13235       );
   13236       goto decode_success;
   13237    }
   13238 
   13239    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   13240    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   13241       mmx) and G to G (mmx). */
   13242    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   13243       mmx) and G to G (mmx). */
   13244    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   13245       to G (mmx). */
   13246    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   13247       to G (mmx). */
   13248    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   13249       to G (mmx). */
   13250    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   13251       to G (mmx). */
   13252 
   13253    if (haveNo66noF2noF3(pfx)
   13254        && sz == 4
   13255        && insn[0] == 0x0F && insn[1] == 0x38
   13256        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   13257            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   13258       HChar* str    = "???";
   13259       IROp   opV64  = Iop_INVALID;
   13260       IROp   opCatO = Iop_CatOddLanes16x4;
   13261       IROp   opCatE = Iop_CatEvenLanes16x4;
   13262       IRTemp sV     = newTemp(Ity_I64);
   13263       IRTemp dV     = newTemp(Ity_I64);
   13264 
   13265       modrm = insn[3];
   13266 
   13267       switch (insn[2]) {
   13268          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   13269          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   13270          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   13271          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   13272          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   13273          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   13274          default: vassert(0);
   13275       }
   13276       if (insn[2] == 0x02 || insn[2] == 0x06) {
   13277          opCatO = Iop_InterleaveHI32x2;
   13278          opCatE = Iop_InterleaveLO32x2;
   13279       }
   13280 
   13281       do_MMX_preamble();
   13282       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13283 
   13284       if (epartIsReg(modrm)) {
   13285          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13286          delta += 3+1;
   13287          DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   13288                                   nameMMXReg(gregLO3ofRM(modrm)));
   13289       } else {
   13290          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13291          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13292          delta += 3+alen;
   13293          DIP("ph%s %s,%s\n", str, dis_buf,
   13294                                   nameMMXReg(gregLO3ofRM(modrm)));
   13295       }
   13296 
   13297       putMMXReg(
   13298          gregLO3ofRM(modrm),
   13299          binop(opV64,
   13300                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   13301                binop(opCatO,mkexpr(sV),mkexpr(dV))
   13302          )
   13303       );
   13304       goto decode_success;
   13305    }
   13306 
   13307    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   13308       xmm) and G to G (xmm). */
   13309    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   13310       xmm) and G to G (xmm). */
   13311    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   13312       G to G (xmm). */
   13313    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   13314       G to G (xmm). */
   13315    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   13316       G to G (xmm). */
   13317    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   13318       G to G (xmm). */
   13319 
   13320    if (have66noF2noF3(pfx)
   13321        && (sz == 2 || /*redundant REX.W*/ sz == 8)
   13322        && insn[0] == 0x0F && insn[1] == 0x38
   13323        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   13324            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   13325       HChar* str    = "???";
   13326       IROp   opV64  = Iop_INVALID;
   13327       IROp   opCatO = Iop_CatOddLanes16x4;
   13328       IROp   opCatE = Iop_CatEvenLanes16x4;
   13329       IRTemp sV     = newTemp(Ity_V128);
   13330       IRTemp dV     = newTemp(Ity_V128);
   13331       IRTemp sHi    = newTemp(Ity_I64);
   13332       IRTemp sLo    = newTemp(Ity_I64);
   13333       IRTemp dHi    = newTemp(Ity_I64);
   13334       IRTemp dLo    = newTemp(Ity_I64);
   13335 
   13336       modrm = insn[3];
   13337 
   13338       switch (insn[2]) {
   13339          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   13340          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   13341          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   13342          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   13343          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   13344          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   13345          default: vassert(0);
   13346       }
   13347       if (insn[2] == 0x02 || insn[2] == 0x06) {
   13348          opCatO = Iop_InterleaveHI32x2;
   13349          opCatE = Iop_InterleaveLO32x2;
   13350       }
   13351 
   13352       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13353 
   13354       if (epartIsReg(modrm)) {
   13355          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13356          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13357                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13358          delta += 3+1;
   13359       } else {
   13360          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13361          gen_SEGV_if_not_16_aligned( addr );
   13362          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13363          DIP("ph%s %s,%s\n", str, dis_buf,
   13364                              nameXMMReg(gregOfRexRM(pfx,modrm)));
   13365          delta += 3+alen;
   13366       }
   13367 
   13368       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   13369       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   13370       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   13371       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   13372 
   13373       /* This isn't a particularly efficient way to compute the
   13374          result, but at least it avoids a proliferation of IROps,
   13375          hence avoids complicating all the backends. */
   13376       putXMMReg(
   13377          gregOfRexRM(pfx,modrm),
   13378          binop(Iop_64HLtoV128,
   13379                binop(opV64,
   13380                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   13381                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   13382                ),
   13383                binop(opV64,
   13384                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   13385                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   13386                )
   13387          )
   13388       );
   13389       goto decode_success;
   13390    }
   13391 
   13392    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   13393       (MMX) */
   13394    if (haveNo66noF2noF3(pfx)
   13395        && sz == 4
   13396        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   13397       IRTemp sV = newTemp(Ity_I64);
   13398       IRTemp dV = newTemp(Ity_I64);
   13399 
   13400       modrm = insn[3];
   13401       do_MMX_preamble();
   13402       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13403 
   13404       if (epartIsReg(modrm)) {
   13405          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13406          delta += 3+1;
   13407          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   13408                                  nameMMXReg(gregLO3ofRM(modrm)));
   13409       } else {
   13410          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13411          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13412          delta += 3+alen;
   13413          DIP("pmulhrsw %s,%s\n", dis_buf,
   13414                                  nameMMXReg(gregLO3ofRM(modrm)));
   13415       }
   13416 
   13417       putMMXReg(
   13418          gregLO3ofRM(modrm),
   13419          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   13420       );
   13421       goto decode_success;
   13422    }
   13423 
   13424    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   13425       Scale (XMM) */
   13426    if (have66noF2noF3(pfx)
   13427        && (sz == 2 || /*redundant REX.W*/ sz == 8)
   13428        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   13429       IRTemp sV  = newTemp(Ity_V128);
   13430       IRTemp dV  = newTemp(Ity_V128);
   13431       IRTemp sHi = newTemp(Ity_I64);
   13432       IRTemp sLo = newTemp(Ity_I64);
   13433       IRTemp dHi = newTemp(Ity_I64);
   13434       IRTemp dLo = newTemp(Ity_I64);
   13435 
   13436       modrm = insn[3];
   13437       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13438 
   13439       if (epartIsReg(modrm)) {
   13440          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   13441          delta += 3+1;
   13442          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13443                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13444       } else {
   13445          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13446          gen_SEGV_if_not_16_aligned( addr );
   13447          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13448          delta += 3+alen;
   13449          DIP("pmulhrsw %s,%s\n", dis_buf,
   13450                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13451       }
   13452 
   13453       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   13454       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   13455       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   13456       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   13457 
   13458       putXMMReg(
   13459          gregOfRexRM(pfx,modrm),
   13460          binop(Iop_64HLtoV128,
   13461                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   13462                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   13463          )
   13464       );
   13465       goto decode_success;
   13466    }
   13467 
   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_I64);
      IRTemp dV      = newTemp(Ity_I64);
      HChar* str     = "???";   /* mnemonic suffix: b/w/d */
      Int    laneszB = 0;       /* lane width in bytes */

      /* Third opcode byte selects the lane size. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
      }

      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
      );
      goto decode_success;
   }
   13510 
   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp dV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);
      IRTemp sLo     = newTemp(Ity_I64);
      IRTemp dHi     = newTemp(Ity_I64);
      IRTemp dLo     = newTemp(Ity_I64);
      HChar* str     = "???";   /* mnemonic suffix: b/w/d */
      Int    laneszB = 0;       /* lane width in bytes */

      /* Third opcode byte selects the lane size. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Do the 128-bit operation as two independent 64-bit halves,
         reusing the 64-bit PSIGN helper on each. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
         )
      );
      goto decode_success;
   }
   13565 
   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_I64);
      HChar* str     = "???";   /* mnemonic suffix: b/w/d */
      Int    laneszB = 0;       /* lane width in bytes */

      /* Third opcode byte selects the lane size. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                    nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameMMXReg(gregLO3ofRM(modrm)));
      }

      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PABS_helper( mkexpr(sV), laneszB )
      );
      goto decode_success;
   }
   13606 
   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);   /* 64-bit halves of sV */
      IRTemp sLo     = newTemp(Ity_I64);
      HChar* str     = "???";   /* mnemonic suffix: b/w/d */
      Int    laneszB = 0;       /* lane width in bytes */

      /* Third opcode byte selects the lane size. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Do the 128-bit operation as two independent 64-bit halves,
         reusing the 64-bit PABS helper on each. */
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PABS_helper( mkexpr(sHi), laneszB ),
               dis_PABS_helper( mkexpr(sLo), laneszB )
         )
      );
      goto decode_success;
   }
   13655 
   13656    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   13657    if (haveNo66noF2noF3(pfx) && sz == 4
   13658        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   13659       IRTemp sV  = newTemp(Ity_I64);
   13660       IRTemp dV  = newTemp(Ity_I64);
   13661       IRTemp res = newTemp(Ity_I64);
   13662 
   13663       modrm = insn[3];
   13664       do_MMX_preamble();
   13665       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13666 
   13667       if (epartIsReg(modrm)) {
   13668          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13669          d64 = (Long)insn[3+1];
   13670          delta += 3+1+1;
   13671          DIP("palignr $%d,%s,%s\n",  (Int)d64,
   13672                                      nameMMXReg(eregLO3ofRM(modrm)),
   13673                                      nameMMXReg(gregLO3ofRM(modrm)));
   13674       } else {
   13675          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
   13676          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13677          d64 = (Long)insn[3+alen];
   13678          delta += 3+alen+1;
   13679          DIP("palignr $%d%s,%s\n", (Int)d64,
   13680                                    dis_buf,
   13681                                    nameMMXReg(gregLO3ofRM(modrm)));
   13682       }
   13683 
   13684       if (d64 == 0) {
   13685          assign( res, mkexpr(sV) );
   13686       }
   13687       else if (d64 >= 1 && d64 <= 7) {
   13688          assign(res,
   13689                 binop(Iop_Or64,
   13690                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   13691                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   13692                      )));
   13693       }
   13694       else if (d64 == 8) {
   13695         assign( res, mkexpr(dV) );
   13696       }
   13697       else if (d64 >= 9 && d64 <= 15) {
   13698          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   13699       }
   13700       else if (d64 >= 16 && d64 <= 255) {
   13701          assign( res, mkU64(0) );
   13702       }
   13703       else
   13704          vassert(0);
   13705 
   13706       putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   13707       goto decode_success;
   13708    }
   13709 
   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
      IRTemp sV  = newTemp(Ity_V128);  /* source (E) operand */
      IRTemp dV  = newTemp(Ity_V128);  /* dest (G) operand */
      IRTemp sHi = newTemp(Ity_I64);   /* 64-bit halves of sV/dV */
      IRTemp sLo = newTemp(Ity_I64);
      IRTemp dHi = newTemp(Ity_I64);
      IRTemp dLo = newTemp(Ity_I64);
      IRTemp rHi = newTemp(Ity_I64);   /* 64-bit halves of result */
      IRTemp rLo = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         d64 = (Long)insn[3+1];     /* imm8: byte shift amount */
         delta += 3+1+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         d64 = (Long)insn[3+alen];  /* imm8: byte shift amount */
         delta += 3+alen+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* Form the 256-bit concatenation dHi:dLo:sHi:sLo and extract the
         128 bits starting at byte offset d64, one 64-bit half of the
         result at a time.  Each 8-byte step in d64 moves the extraction
         window one lane to the left. */
      if (d64 == 0) {
         assign( rHi, mkexpr(sHi) );
         assign( rLo, mkexpr(sLo) );
      }
      else if (d64 >= 1 && d64 <= 7) {
         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) );
         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) );
      }
      else if (d64 == 8) {
         assign( rHi, mkexpr(dLo) );
         assign( rLo, mkexpr(sHi) );
      }
      else if (d64 >= 9 && d64 <= 15) {
         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) );
         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) );
      }
      else if (d64 == 16) {
         assign( rHi, mkexpr(dHi) );
         assign( rLo, mkexpr(dLo) );
      }
      else if (d64 >= 17 && d64 <= 23) {
         /* Top of the window runs off the end of dHi: plain shift. */
         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) );
         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) );
      }
      else if (d64 == 24) {
         assign( rHi, mkU64(0) );
         assign( rLo, mkexpr(dHi) );
      }
      else if (d64 >= 25 && d64 <= 31) {
         assign( rHi, mkU64(0) );
         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) );
      }
      else if (d64 >= 32 && d64 <= 255) {
         /* Shift >= total operand width: result is all zeroes. */
         assign( rHi, mkU64(0) );
         assign( rLo, mkU64(0) );
      }
      else
         vassert(0);

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
   13794 
   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV      = newTemp(Ity_I64);   /* shuffle-control (E) operand */
      IRTemp dV      = newTemp(Ity_I64);   /* data (G) operand */

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                               nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Result lane i = (sV lane i bit 7 set) ? 0 : dV[sV lane i & 7]. */
      putMMXReg(
         gregLO3ofRM(modrm),
         binop(
            Iop_And64,
            /* permute the lanes */
            binop(
               Iop_Perm8x8,
               mkexpr(dV),
               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
            ),
            /* mask off lanes which have (index & 0x80) == 0x80 */
            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
         )
      );
      goto decode_success;
   }
   13835 
   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV         = newTemp(Ity_V128);  /* shuffle-control (E) operand */
      IRTemp dV         = newTemp(Ity_V128);  /* data (G) operand */
      IRTemp sHi        = newTemp(Ity_I64);   /* 64-bit halves of sV/dV */
      IRTemp sLo        = newTemp(Ity_I64);
      IRTemp dHi        = newTemp(Ity_I64);
      IRTemp dLo        = newTemp(Ity_I64);
      IRTemp rHi        = newTemp(Ity_I64);   /* 64-bit halves of result */
      IRTemp rLo        = newTemp(Ity_I64);
      IRTemp sevens     = newTemp(Ity_I64);   /* 0x07 in every byte lane */
      IRTemp mask0x80hi = newTemp(Ity_I64);   /* zero lanes whose index has bit 7 */
      IRTemp mask0x80lo = newTemp(Ity_I64);
      IRTemp maskBit3hi = newTemp(Ity_I64);   /* selects dHi vs dLo per lane */
      IRTemp maskBit3lo = newTemp(Ity_I64);
      IRTemp sAnd7hi    = newTemp(Ity_I64);   /* per-lane index within a half */
      IRTemp sAnd7lo    = newTemp(Ity_I64);
      IRTemp permdHi    = newTemp(Ity_I64);
      IRTemp permdLo    = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      assign( sevens, mkU64(0x0707070707070707ULL) );

      /* A 16-lane shuffle done with 8-lane primitives: each half is
         permuted out of both dHi and dLo, with per-lane masks choosing
         the correct source half (control-byte bit 3) and zeroing lanes
         whose control byte has bit 7 set.

      mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
      */
      assign(
         mask0x80hi,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

      assign(
         maskBit3hi,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
               mkU8(7)));

      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

      assign(
         permdHi,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
                  mkexpr(maskBit3hi)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));

      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

      /* And the same for the lower half of the result.  What fun. */

      assign(
         mask0x80lo,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

      assign(
         maskBit3lo,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
               mkU8(7)));

      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

      assign(
         permdLo,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
                  mkexpr(maskBit3lo)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));

      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
   13948 
   13949    /* ---------------------------------------------------- */
   13950    /* --- end of the SSSE3 decoder.                    --- */
   13951    /* ---------------------------------------------------- */
   13952 
   13953    /* ---------------------------------------------------- */
   13954    /* --- start of the SSE4 decoder                    --- */
   13955    /* ---------------------------------------------------- */
   13956 
   /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
      Blend Packed Double Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0D ) {

      Int imm8;                 /* blend-control immediate */
      UShort imm8_mask_16;      /* imm8 expanded to a 16-nibble mkV128 mask */

      IRTemp dst_vec = newTemp(Ity_V128);
      IRTemp src_vec = newTemp(Ity_V128);
      IRTemp imm8_mask = newTemp(Ity_V128);

      modrm = insn[3];
      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "blendpd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "blendpd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Only imm8 bits 0..1 matter: one bit per 64-bit lane. */
      switch( imm8 & 3 ) {
         case 0:  imm8_mask_16 = 0x0000; break;
         case 1:  imm8_mask_16 = 0x00FF; break;
         case 2:  imm8_mask_16 = 0xFF00; break;
         case 3:  imm8_mask_16 = 0xFFFF; break;
         default: vassert(0);            break;
      }
      assign( imm8_mask, mkV128( imm8_mask_16 ) );

      /* result = (src & mask) | (dst & ~mask) */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
                        binop( Iop_AndV128, mkexpr(dst_vec),
                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );

      goto decode_success;
   }
   14008 
   14009 
   14010    /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   14011       Blend Packed Single Precision Floating-Point Values (XMM) */
   14012    if ( have66noF2noF3( pfx )
   14013         && sz == 2
   14014         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0C ) {
   14015 
   14016       Int imm8;
   14017       IRTemp dst_vec = newTemp(Ity_V128);
   14018       IRTemp src_vec = newTemp(Ity_V128);
   14019 
   14020       modrm = insn[3];
   14021 
   14022       assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14023 
   14024       if ( epartIsReg( modrm ) ) {
   14025          imm8 = (Int)insn[3+1];
   14026          assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14027          delta += 3+1+1;
   14028          DIP( "blendps $%d, %s,%s\n", imm8,
   14029               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14030               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14031       } else {
   14032          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14033                           1/* imm8 is 1 byte after the amode */ );
   14034          gen_SEGV_if_not_16_aligned( addr );
   14035          assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   14036          imm8 = (Int)insn[3+alen];
   14037          delta += 3+alen+1;
   14038          DIP( "blendpd $%d, %s,%s\n",
   14039               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14040       }
   14041 
   14042       UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00, 0x0F0F,
   14043                                 0x0FF0, 0x0FFF, 0xF000, 0xF00F, 0xF0F0, 0xF0FF,
   14044                                 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };
   14045       IRTemp imm8_mask = newTemp(Ity_V128);
   14046       assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   14047 
   14048       putXMMReg( gregOfRexRM(pfx, modrm),
   14049                  binop( Iop_OrV128,
   14050                         binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
   14051                         binop( Iop_AndV128, mkexpr(dst_vec),
   14052                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   14053 
   14054       goto decode_success;
   14055    }
   14056 
   14057 
   14058    /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   14059       Blend Packed Words (XMM) */
   14060    if ( have66noF2noF3( pfx )
   14061         && sz == 2
   14062         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0E ) {
   14063 
   14064       Int imm8;
   14065       IRTemp dst_vec = newTemp(Ity_V128);
   14066       IRTemp src_vec = newTemp(Ity_V128);
   14067 
   14068       modrm = insn[3];
   14069 
   14070       assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14071 
   14072       if ( epartIsReg( modrm ) ) {
   14073          imm8 = (Int)insn[3+1];
   14074          assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14075          delta += 3+1+1;
   14076          DIP( "pblendw $%d, %s,%s\n", imm8,
   14077               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14078               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14079       } else {
   14080          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14081                           1/* imm8 is 1 byte after the amode */ );
   14082          gen_SEGV_if_not_16_aligned( addr );
   14083          assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   14084          imm8 = (Int)insn[3+alen];
   14085          delta += 3+alen+1;
   14086          DIP( "pblendw $%d, %s,%s\n",
   14087               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14088       }
   14089 
   14090       /* Make w be a 16-bit version of imm8, formed by duplicating each
   14091          bit in imm8. */
   14092       Int i;
   14093       UShort imm16 = 0;
   14094       for (i = 0; i < 8; i++) {
   14095          if (imm8 & (1 << i))
   14096              imm16 |= (3 << (2*i));
   14097       }
   14098       IRTemp imm16_mask = newTemp(Ity_V128);
   14099       assign( imm16_mask, mkV128( imm16 ));
   14100 
   14101       putXMMReg( gregOfRexRM(pfx, modrm),
   14102                  binop( Iop_OrV128,
   14103                         binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm16_mask) ),
   14104                         binop( Iop_AndV128, mkexpr(dst_vec),
   14105                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   14106 
   14107       goto decode_success;
   14108    }
   14109 
   14110 
   14111    /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   14112     * Carry-less multiplication of selected XMM quadwords into XMM
   14113     * registers (a.k.a multiplication of polynomials over GF(2))
   14114     */
   14115    if ( have66noF2noF3( pfx )
   14116         && sz == 2
   14117         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x44 ) {
   14118 
   14119       Int imm8;
   14120       IRTemp svec = newTemp(Ity_V128);
   14121       IRTemp dvec = newTemp(Ity_V128);
   14122 
   14123       modrm = insn[3];
   14124 
   14125       assign( dvec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14126 
   14127       if ( epartIsReg( modrm ) ) {
   14128          imm8 = (Int)insn[4];
   14129          assign( svec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14130          delta += 3+1+1;
   14131          DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   14132               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14133               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14134       } else {
   14135          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14136                           1/* imm8 is 1 byte after the amode */ );
   14137          gen_SEGV_if_not_16_aligned( addr );
   14138          assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   14139          imm8 = (Int)insn[2+alen+1];
   14140          delta += 3+alen+1;
   14141          DIP( "pclmulqdq $%d, %s,%s\n",
   14142               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14143       }
   14144 
   14145       t0 = newTemp(Ity_I64);
   14146       t1 = newTemp(Ity_I64);
   14147       assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64, mkexpr(dvec)));
   14148       assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64, mkexpr(svec)));
   14149 
   14150       t2 = newTemp(Ity_I64);
   14151       t3 = newTemp(Ity_I64);
   14152 
   14153       IRExpr** args;
   14154 
   14155       args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   14156       assign(t2,
   14157               mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   14158                                        &amd64g_calculate_pclmul, args));
   14159       args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   14160       assign(t3,
   14161               mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   14162                                        &amd64g_calculate_pclmul, args));
   14163 
   14164       IRTemp res     = newTemp(Ity_V128);
   14165       assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   14166       putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14167 
   14168       goto decode_success;
   14169    }
   14170 
   /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
      Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x41 ) {

      Int imm8;   /* bits 4..5: input mask; bits 0..1: output mask */
      IRTemp src_vec = newTemp(Ity_V128);
      IRTemp dst_vec = newTemp(Ity_V128);
      IRTemp and_vec = newTemp(Ity_V128);   /* masked products */
      IRTemp sum_vec = newTemp(Ity_V128);   /* their sum */

      modrm = insn[3];

      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "dppd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory operand must be 16-aligned; fault otherwise. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dppd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Per-lane masks: 2 imm8 bits -> full/empty 64-bit lanes. */
      UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };

      /* Multiply the lane pairs, keeping only lanes enabled by
         imm8 bits 4..5. */
      assign( and_vec, binop( Iop_AndV128,
                              binop( Iop_Mul64Fx2,
                                     mkexpr(dst_vec), mkexpr(src_vec) ),
                              mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

      /* Sum the two masked products into the low lane. */
      assign( sum_vec, binop( Iop_Add64F0x2,
                              binop( Iop_InterleaveHI64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ),
                              binop( Iop_InterleaveLO64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ) ) );

      /* Broadcast the sum, then keep only the output lanes enabled
         by imm8 bits 0..1. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_AndV128,
                        binop( Iop_InterleaveLO64x2,
                               mkexpr(sum_vec), mkexpr(sum_vec) ),
                        mkV128( imm8_perms[ (imm8 & 3) ] ) ) );

      goto decode_success;
   }
   14226 
   14227 
   /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
      Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F
        && insn[1] == 0x3A
        && insn[2] == 0x40 ) {

      Int imm8;
      IRTemp xmm1_vec     = newTemp(Ity_V128);   /* destination operand */
      IRTemp xmm2_vec     = newTemp(Ity_V128);   /* source operand */
      IRTemp tmp_prod_vec = newTemp(Ity_V128);   /* input-masked products */
      IRTemp prod_vec     = newTemp(Ity_V128);   /* products, lanes reordered */
      IRTemp sum_vec      = newTemp(Ity_V128);
      IRTemp v3, v2, v1, v0;
      v3 = v2 = v1 = v0   = IRTemp_INVALID;

      modrm = insn[3];

      assign( xmm1_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( xmm2_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "dpps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory source: must be 16-aligned; imm8 follows the amode. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( xmm2_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dpps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Byte-select masks for mkV128: entry i has nibble 0xF for each
         32-bit lane whose bit is set in the 4-bit index i. */
      UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                                0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                                0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };

      /* Multiply lane-wise, zeroing the lanes deselected by imm8[7:4]. */
      assign( tmp_prod_vec,
              binop( Iop_AndV128,
                     binop( Iop_Mul32Fx4, mkexpr(xmm1_vec), mkexpr(xmm2_vec) ),
                     mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
      /* Reorder product lanes to 3,1,2,0 -- the two interleave/add
         reduction steps below depend on this ordering. */
      breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
      assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );

      /* First reduction: pairwise sums of the products. */
      assign( sum_vec, binop( Iop_Add32Fx4,
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

      /* Second reduction gives the full dot product in all lanes;
         keep only the lanes selected by imm8[3:0] (the output mask). */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_AndV128,
                        binop( Iop_Add32Fx4,
                               binop( Iop_InterleaveHI32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ),
                               binop( Iop_InterleaveLO32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                        mkV128( imm8_perms[ (imm8 & 15) ] ) ) );

      goto decode_success;
   }
   14295 
   14296 
   /* 66 0F 3A 21 /r ib = INSERTPS xmm1, xmm2/m32, imm8
      Insert Packed Single Precision Floating-Point Value (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x21 ) {

      Int imm8;
      Int imm8_count_s;   /* imm8[7:6]: source lane (register form only) */
      Int imm8_count_d;   /* imm8[5:4]: destination lane */
      Int imm8_zmask;     /* imm8[3:0]: destination lanes forced to zero */
      IRTemp dstVec   = newTemp(Ity_V128);
      IRTemp srcDWord = newTemp(Ity_I32);   /* the 32-bit value to insert */

      modrm = insn[3];

      assign( dstVec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         IRTemp src_vec = newTemp(Ity_V128);
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );

         IRTemp src_lane_0 = IRTemp_INVALID;
         IRTemp src_lane_1 = IRTemp_INVALID;
         IRTemp src_lane_2 = IRTemp_INVALID;
         IRTemp src_lane_3 = IRTemp_INVALID;
         breakup128to32s( src_vec,
                          &src_lane_3, &src_lane_2, &src_lane_1, &src_lane_0 );

         imm8 = (Int)insn[4];
         /* Register form: imm8[7:6] picks which source lane to take. */
         imm8_count_s = ((imm8 >> 6) & 3);
         switch( imm8_count_s ) {
           case 0:  assign( srcDWord, mkexpr(src_lane_0) ); break;
           case 1:  assign( srcDWord, mkexpr(src_lane_1) ); break;
           case 2:  assign( srcDWord, mkexpr(src_lane_2) ); break;
           case 3:  assign( srcDWord, mkexpr(src_lane_3) ); break;
           default: vassert(0);                             break;
         }

         delta += 3+1+1;
         DIP( "insertps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form: the 32-bit value is loaded directly, so the
            source-lane field is irrelevant. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* const imm8 is 1 byte after the amode */ );
         assign( srcDWord, loadLE( Ity_I32, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         imm8_count_s = 0;
         delta += 3+alen+1;
         DIP( "insertps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp dst_lane_0 = IRTemp_INVALID;
      IRTemp dst_lane_1 = IRTemp_INVALID;
      IRTemp dst_lane_2 = IRTemp_INVALID;
      IRTemp dst_lane_3 = IRTemp_INVALID;
      breakup128to32s( dstVec,
                       &dst_lane_3, &dst_lane_2, &dst_lane_1, &dst_lane_0 );

      /* Replace the destination lane named by imm8[5:4]. */
      imm8_count_d = ((imm8 >> 4) & 3);
      switch( imm8_count_d ) {
         case 0:  dst_lane_0 = srcDWord; break;
         case 1:  dst_lane_1 = srcDWord; break;
         case 2:  dst_lane_2 = srcDWord; break;
         case 3:  dst_lane_3 = srcDWord; break;
         default: vassert(0);            break;
      }

      /* Finally apply the zero-mask: each set bit in imm8[3:0] forces
         the corresponding result lane to +0.0. */
      imm8_zmask = (imm8 & 15);
      IRTemp zero_32 = newTemp(Ity_I32);
      assign( zero_32, mkU32(0) );

      IRExpr* ire_vec_128 = mk128from32s(
                               ((imm8_zmask & 8) == 8) ? zero_32 : dst_lane_3,
                               ((imm8_zmask & 4) == 4) ? zero_32 : dst_lane_2,
                               ((imm8_zmask & 2) == 2) ? zero_32 : dst_lane_1,
                               ((imm8_zmask & 1) == 1) ? zero_32 : dst_lane_0 );

      putXMMReg( gregOfRexRM(pfx, modrm), ire_vec_128 );

      goto decode_success;
   }
   14380 
   14381 
   14382   /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   14383      Extract Byte from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
   14384   if ( have66noF2noF3( pfx )
   14385        && sz == 2
   14386        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x14 ) {
   14387 
   14388      Int imm8;
   14389      IRTemp xmm_vec  = newTemp(Ity_V128);
   14390      IRTemp sel_lane = newTemp(Ity_I32);
   14391      IRTemp shr_lane = newTemp(Ity_I32);
   14392 
   14393      modrm = insn[3];
   14394      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   14395      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   14396 
   14397      if ( epartIsReg( modrm ) ) {
   14398         imm8 = (Int)insn[3+1];
   14399      } else {
   14400         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
   14401         imm8 = (Int)insn[3+alen];
   14402      }
   14403      switch( (imm8 >> 2) & 3 ) {
   14404         case 0:  assign( sel_lane, mkexpr(t0) ); break;
   14405         case 1:  assign( sel_lane, mkexpr(t1) ); break;
   14406         case 2:  assign( sel_lane, mkexpr(t2) ); break;
   14407         case 3:  assign( sel_lane, mkexpr(t3) ); break;
   14408         default: vassert(0);
   14409      }
   14410      assign( shr_lane,
   14411              binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
   14412 
   14413      if ( epartIsReg( modrm ) ) {
   14414         putIReg64( eregOfRexRM(pfx,modrm),
   14415                    unop( Iop_32Uto64,
   14416                          binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
   14417 
   14418         delta += 3+1+1;
   14419         DIP( "pextrb $%d, %s,%s\n", imm8,
   14420              nameXMMReg( gregOfRexRM(pfx, modrm) ),
   14421              nameIReg64( eregOfRexRM(pfx, modrm) ) );
   14422      } else {
   14423         storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
   14424         delta += 3+alen+1;
   14425         DIP( "$%d, pextrb %s,%s\n",
   14426              imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   14427      }
   14428 
   14429      goto decode_success;
   14430   }
   14431 
   14432 
   /* 66 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
      Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRQ, but
      here the REX.W bit is _not_ present */
   if ( have66noF2noF3( pfx )
        && sz == 2  /* REX.W is _not_ present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_10;   /* imm8[1:0]: which 32-bit lane to extract */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
      }

      /* Pick the dword lane named by imm8[1:0]. */
      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;
         DIP( "pextrd $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "pextrd $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14479 
   14480 
   /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
      Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRD, but
      here the REX.W bit is present */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_0;   /* imm8[0]: which 64-bit half to extract */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_qword = newTemp(Ity_I64);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);
      } else {
         addr   = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);
      }
      /* 0 selects the low half, 1 the high half. */
      switch ( imm8_0 ) {
         case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) ); break;
         case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
         delta += 3+1+1;
         DIP( "pextrq $%d, %s,%s\n", imm8_0,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_qword) );
         delta += 3+alen+1;
         DIP( "pextrq $%d, %s,%s\n",
              imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14523 
   14524 
   /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
      Extract Word from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x15 ) {

      Int imm8_20;   /* imm8[2:0]: which 16-bit lane to extract */
      IRTemp xmm_vec = newTemp(Ity_V128);
      IRTemp src_word = newTemp(Ity_I16);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_20 = (Int)(insn[3+1] & 7);
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_20 = (Int)(insn[3+alen] & 7);
      }

      /* t0..t3 are 32-bit lanes, so even indices take the low half
         and odd indices the high half of the relevant lane. */
      switch ( imm8_20 ) {
         case 0:  assign( src_word, unop(Iop_32to16,   mkexpr(t0)) ); break;
         case 1:  assign( src_word, unop(Iop_32HIto16, mkexpr(t0)) ); break;
         case 2:  assign( src_word, unop(Iop_32to16,   mkexpr(t1)) ); break;
         case 3:  assign( src_word, unop(Iop_32HIto16, mkexpr(t1)) ); break;
         case 4:  assign( src_word, unop(Iop_32to16,   mkexpr(t2)) ); break;
         case 5:  assign( src_word, unop(Iop_32HIto16, mkexpr(t2)) ); break;
         case 6:  assign( src_word, unop(Iop_32to16,   mkexpr(t3)) ); break;
         case 7:  assign( src_word, unop(Iop_32HIto16, mkexpr(t3)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         /* Register destination: zero-extend the word into all 64 bits. */
         putIReg64( eregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(src_word)) );
         delta += 3+1+1;
         DIP( "pextrw $%d, %s,%s\n", imm8_20,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory destination: store just the 16-bit word. */
         storeLE( mkexpr(addr), mkexpr(src_word) );
         delta += 3+alen+1;
         DIP( "pextrw $%d, %s,%s\n",
              imm8_20, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14573 
   14574 
   /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
      Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_0;   /* imm8[0]: which 64-bit half to replace */
      IRTemp src_elems = newTemp(Ity_I64);
      IRTemp src_vec   = newTemp(Ity_V128);

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);
         assign( src_elems, getIReg64( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;
         DIP( "pinsrq $%d, %s,%s\n", imm8_0,
              nameIReg64( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);
         assign( src_elems, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrq $%d, %s,%s\n",
              imm8_0, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Build a vector with the new quadword in the selected half,
         plus a byte mask (for mkV128) that keeps the other half. */
      UShort mask = 0;
      if ( imm8_0 == 0 ) {
         mask = 0xFF00;   /* keep the high half of the destination */
         assign( src_vec,  binop( Iop_64HLtoV128, mkU64(0), mkexpr(src_elems) ) );
      } else {
         mask = 0x00FF;   /* keep the low half of the destination */
         assign( src_vec, binop( Iop_64HLtoV128, mkexpr(src_elems), mkU64(0) ) );
      }

      /* Merge: (dst & mask) | new-quadword-in-place. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14620 
   14621 
   /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
      Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2 /* REX.W is NOT present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_10;   /* imm8[1:0]: which 32-bit lane to replace */
      IRTemp src_elems = newTemp(Ity_I32);
      IRTemp src_vec   = newTemp(Ity_V128);
      IRTemp z32       = newTemp(Ity_I32);

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
         assign( src_elems, getIReg32( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;
         DIP( "pinsrd $%d, %s,%s\n", imm8_10,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
         assign( src_elems, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrd $%d, %s,%s\n",
              imm8_10, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(z32, mkU32(0));

      /* Build a vector with the new dword in the selected lane, plus a
         byte mask (for mkV128) with a zero nibble at that lane so the
         merge below keeps the other three destination lanes. */
      UShort mask = 0;
      switch (imm8_10) {
         case 3:  mask = 0x0FFF;
                  assign(src_vec, mk128from32s(src_elems, z32, z32, z32));
                  break;
         case 2:  mask = 0xF0FF;
                  assign(src_vec, mk128from32s(z32, src_elems, z32, z32));
                  break;
         case 1:  mask = 0xFF0F;
                  assign(src_vec, mk128from32s(z32, z32, src_elems, z32));
                  break;
         case 0:  mask = 0xFFF0;
                  assign(src_vec, mk128from32s(z32, z32, z32, src_elems));
                  break;
         default: vassert(0);
      }

      /* Merge: (dst & mask) | new-dword-in-place. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14678 
   /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
      Extract byte from r32/m8 and insert into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) {

      Int    imm8;   /* imm8[3:0]: destination byte lane */
      IRTemp new8 = newTemp(Ity_I64);   /* the byte to insert, zero-extended */

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)(insn[3+1] & 0xF);
         /* Register source: only the low byte of r32 is used. */
         assign( new8, binop(Iop_And64,
                             unop(Iop_32Uto64,
                                  getIReg32(eregOfRexRM(pfx,modrm))),
                             mkU64(0xFF)));
         delta += 3+1+1;
         DIP( "pinsrb $%d,%s,%s\n", imm8,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8 = (Int)(insn[3+alen] & 0xF);
         assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) )));
         delta += 3+alen+1;
         DIP( "pinsrb $%d,%s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      // Create a V128 value which has the selected byte in the
      // specified lane, and zeroes everywhere else.
      IRTemp tmp128 = newTemp(Ity_V128);
      IRTemp halfshift = newTemp(Ity_I64);
      /* Shift the byte to its position within the relevant 64-bit half. */
      assign(halfshift, binop(Iop_Shl64,
                              mkexpr(new8), mkU8(8 * (imm8 & 7))));
      vassert(imm8 >= 0 && imm8 <= 15);
      if (imm8 < 8) {
         assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
      } else {
         assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
      }

      /* Byte mask (for mkV128) with a zero bit only at the target byte,
         so the merge keeps the other 15 destination bytes. */
      UShort mask = ~(1 << imm8);

      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        mkexpr(tmp128),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14733 
   14734 
   /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
      float from xmm reg and store in gen.reg or mem.  This is
      identical to PEXTRD, except that REX.W appears to be ignored.
   */
   if ( have66noF2noF3( pfx )
        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {

      Int imm8_10;   /* imm8[1:0]: which 32-bit lane to extract */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
      }

      /* Pick the lane named by imm8[1:0]; the bits are moved verbatim
         (no FP conversion involved). */
      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;
         DIP( "extractps $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "extractps $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14781 
   14782 
   /* 66 0F 38 37 = PCMPGTQ
      64x2 comparison (signed, presumably; the Intel docs don't say :-)
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
      /* FIXME: this needs an alignment check */
      /* Operand decode and write-back are handled by the common SSE
         integer E-to-G helper; Iop_CmpGT64Sx2 is a signed compare. */
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
                                 "pcmpgtq", Iop_CmpGT64Sx2, False );
      goto decode_success;
   }
   14793 
   14794    /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
   14795       Maximum of Packed Signed Double Word Integers (XMM)
   14796       66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
   14797       Minimum of Packed Signed Double Word Integers (XMM) */
   14798    if ( have66noF2noF3( pfx ) && sz == 2
   14799         && insn[0] == 0x0F && insn[1] == 0x38
   14800         && (insn[2] == 0x3D || insn[2] == 0x39)) {
   14801       /* FIXME: this needs an alignment check */
   14802       Bool isMAX = insn[2] == 0x3D;
   14803       delta = dis_SSEint_E_to_G(
   14804                  vbi, pfx, delta+3,
   14805                  isMAX ? "pmaxsd" : "pminsd",
   14806                  isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
   14807                  False
   14808               );
   14809       goto decode_success;
   14810    }
   14811 
   14812    /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
   14813       Maximum of Packed Unsigned Doubleword Integers (XMM)
   14814       66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
   14815       Minimum of Packed Unsigned Doubleword Integers (XMM) */
   14816    if ( have66noF2noF3( pfx ) && sz == 2
   14817         && insn[0] == 0x0F && insn[1] == 0x38
   14818         && (insn[2] == 0x3F || insn[2] == 0x3B)) {
   14819       /* FIXME: this needs an alignment check */
   14820       Bool isMAX = insn[2] == 0x3F;
   14821       delta = dis_SSEint_E_to_G(
   14822                  vbi, pfx, delta+3,
   14823                  isMAX ? "pmaxud" : "pminud",
   14824                  isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
   14825                  False
   14826               );
   14827       goto decode_success;
   14828    }
   14829 
   14830    /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
   14831       Maximum of Packed Unsigned Word Integers (XMM)
   14832       66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
   14833       Minimum of Packed Unsigned Word Integers (XMM)
   14834    */
   14835    if ( have66noF2noF3( pfx ) && sz == 2
   14836         && insn[0] == 0x0F && insn[1] == 0x38
   14837         && (insn[2] == 0x3E || insn[2] == 0x3A)) {
   14838       /* FIXME: this needs an alignment check */
   14839       Bool isMAX = insn[2] == 0x3E;
   14840       delta = dis_SSEint_E_to_G(
   14841                  vbi, pfx, delta+3,
   14842                  isMAX ? "pmaxuw" : "pminuw",
   14843                  isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
   14844                  False
   14845               );
   14846       goto decode_success;
   14847    }
   14848 
   14849    /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
   14850       8Sx16 (signed) max
   14851       66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
   14852       8Sx16 (signed) min
   14853    */
   14854    if ( have66noF2noF3( pfx ) && sz == 2
   14855         && insn[0] == 0x0F && insn[1] == 0x38
   14856         && (insn[2] == 0x3C || insn[2] == 0x38)) {
   14857       /* FIXME: this needs an alignment check */
   14858       Bool isMAX = insn[2] == 0x3C;
   14859       delta = dis_SSEint_E_to_G(
   14860                  vbi, pfx, delta+3,
   14861                  isMAX ? "pmaxsb" : "pminsb",
   14862                  isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   14863                  False
   14864               );
   14865       goto decode_success;
   14866    }
   14867 
   /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
      Packed Move with Sign Extend from Byte to Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x20 ) {

      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;
         DIP( "pmovsxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits (the 8 source bytes). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Widen bytes to 16-bit lanes by interleaving with zero, then
         sign-extend each lane with a shift-left-8 / arith-shift-right-8
         pair. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN16x8,
                        binop( Iop_ShlN16x8,
                               binop( Iop_InterleaveLO8x16,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(8) ),
                        mkU8(8) ) );

      goto decode_success;
   }
   14904 
   14905 
   /* 66 0f 38 21 /r = PMOVSXBD xmm1, xmm2/m32
      Packed Move with Sign Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x21 ) {

      /* modrm byte follows the three opcode bytes 0F 38 21 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) )  );
      } else {
         /* Memory form reads only 32 bits (4 source bytes). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Double interleave with zero widens each of the 4 low bytes to a
         32-bit lane; the shift-left/arith-shift-right by 24 pair then
         sign-extends the byte within its lane. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec),
                                      binop( Iop_InterleaveLO8x16,
                                             mkexpr(zeroVec),
                                             mkexpr(srcVec) ) ),
                               mkU8(24) ), mkU8(24) ) );

      goto decode_success;
   }
   14946 
   14947 
   /* 66 0f 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
      Packed Move with Sign Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x22 ) {

      /* modrm byte follows the three opcode bytes 0F 38 22 */
      modrm = insn[3];

      /* Only 2 source bytes are consumed, so grab them as an I16. */
      IRTemp srcBytes = newTemp(Ity_I16);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane16( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Sign-extend each byte to 64 bits scalar-wise and rebuild the
         vector: high byte -> upper qword, low byte -> lower qword. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_8Sto64,
                              unop( Iop_16HIto8,
                                    mkexpr(srcBytes) ) ),
                        unop( Iop_8Sto64,
                              unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   14982 
   14983 
   /* 66 0f 38 23 /r = PMOVSXWD xmm1, xmm2/m64
      Packed Move with Sign Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x23 ) {

      /* modrm byte follows the three opcode bytes 0F 38 23 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits (4 source words). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Interleave the 4 low words with zero to form 32-bit lanes, then
         shl/sar by 16 to sign-extend each word within its lane. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO16x8,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(16) ),
                        mkU8(16) ) );

      goto decode_success;
   }
   15020 
   15021 
   /* 66 0f 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
      Packed Move with Sign Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x24 ) {

      /* modrm byte follows the three opcode bytes 0F 38 24 */
      modrm = insn[3];

      /* Only 2 source words are consumed, so grab them as an I32. */
      IRTemp srcBytes = newTemp(Ity_I32);

      if ( epartIsReg( modrm ) ) {
         assign( srcBytes, getXMMRegLane32( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Sign-extend each word to 64 bits scalar-wise and rebuild the
         vector: high word -> upper qword, low word -> lower qword. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_16Sto64,
                              unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
                        unop( Iop_16Sto64,
                              unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   15055 
   15056 
   /* 66 0f 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
      Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x25 ) {

      /* modrm byte follows the three opcode bytes 0F 38 25 */
      modrm = insn[3];

      /* Only 2 source dwords are consumed, so grab them as an I64. */
      IRTemp srcBytes = newTemp(Ity_I64);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane64( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Sign-extend each dword to 64 bits scalar-wise and rebuild the
         vector: high dword -> upper qword, low dword -> lower qword. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_64HLtoV128,
                        unop( Iop_32Sto64,
                              unop( Iop_64HIto32, mkexpr(srcBytes) ) ),
                        unop( Iop_32Sto64,
                              unop( Iop_64to32, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   15090 
   15091 
   /* 66 0f 38 30 /r = PMOVZXBW xmm1, xmm2/m64
      Packed Move with Zero Extend from Byte to Word (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x30 ) {

      /* modrm byte follows the three opcode bytes 0F 38 30 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits (8 source bytes). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Zero-extension needs only a single interleave with zero: each
         low byte lands in a 16-bit lane with a zero byte alongside. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO8x16,
                        IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );

      goto decode_success;
   }
   15123 
   15124 
   /* 66 0f 38 31 /r = PMOVZXBD xmm1, xmm2/m32
      Packed Move with Zero Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x31 ) {

      /* modrm byte follows the three opcode bytes 0F 38 31 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits (4 source bytes). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two interleaves with zero spread each of the 4 low bytes into
         its own 32-bit lane, with the upper 24 bits already zero. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
   15161 
   15162 
   /* 66 0f 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
      Packed Move with Zero Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x32 ) {

      /* modrm byte follows the three opcode bytes 0F 38 32 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 16 bits (2 source bytes), widened
            to a V128 via I16 -> I32 -> V128. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128,
                       unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Three interleaves with zero spread each of the 2 low bytes into
         its own 64-bit lane, upper 56 bits zero. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec),
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );

      goto decode_success;
   }
   15202 
   15203 
   /* 66 0f 38 33 /r = PMOVZXWD xmm1, xmm2/m64
      Packed Move with Zero Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x33 ) {

      /* modrm byte follows the three opcode bytes 0F 38 33 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits (4 source words). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Single interleave with zero: each low word lands in a 32-bit
         lane with a zero word alongside. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO16x8,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
   15236 
   15237 
   /* 66 0f 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
      Packed Move with Zero Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x34 ) {

      /* modrm byte follows the three opcode bytes 0F 38 34 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits (2 source words). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp( Ity_V128 );
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two 16x8 interleaves with zero spread each of the 2 low words
         into its own 64-bit lane, upper 48 bits zero. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO16x8,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO16x8,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
   15274 
   15275 
   /* 66 0f 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
      Packed Move with Zero Extend from DWord to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x35 ) {

      /* modrm byte follows the three opcode bytes 0F 38 35 */
      modrm = insn[3];

      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits (2 source dwords). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Single 32x4 interleave with zero: each low dword lands in a
         64-bit lane with a zero dword alongside. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO32x4,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
   15308 
   15309 
   /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
      32x4 integer multiply from xmm2/m128 to xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {

      /* modrm byte follows the three opcode bytes 0F 38 40 */
      modrm = insn[3];

      IRTemp argL = newTemp(Ity_V128);
      IRTemp argR = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmulld %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand: must be 16-aligned. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "pmulld %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

      goto decode_success;
   }
   15343 
   15344 
   /* F3 0F B8  = POPCNT{W,L,Q}
      Count the number of 1 bits in a register
    */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xB8) {
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      /* Two-byte opcode 0F B8, so modrm is at insn[2]. */
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;   /* 2 opcode bytes + modrm */
         DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      IRTemp result = gen_POPCOUNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(result));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A C P are cleared.  Z is set if SRC == 0.
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      /* DEP1 holds the flags directly (OP_COPY); only the Z bit may be
         set, computed from whether the source was zero. */
      stmt( IRStmt_Put( OFFB_CC_DEP1,
            binop(Iop_Shl64,
                  unop(Iop_1Uto64,
                       binop(Iop_CmpEQ64,
                             widenUto64(mkexpr(src)),
                             mkU64(0))),
                  mkU8(AMD64G_CC_SHIFT_Z))));

      goto decode_success;
   }
   15386 
   15387 
   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] == 0x0B || insn[2] == 0x0A)) {

      Bool   isD = insn[2] == 0x0B;   /* 0x0B = double, 0x0A = single */
      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
      Int    imm = 0;

      /* modrm follows the three opcode bytes; imm8 follows the amode. */
      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src,
                 isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
                     : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         imm = insn[3+1];
         /* Only imm bits 2:0 are handled; refuse anything else. */
         if (imm & ~7) goto decode_failure;
         delta += 3+1+1;   /* 3 opcode bytes + modrm + imm8 */
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
         imm = insn[3+alen];
         if (imm & ~7) goto decode_failure;
         delta += 3+alen+1;   /* 3 opcode bytes + amode + imm8 */
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC" instead. */
      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                        (imm & 4) ? get_sse_roundingmode()
                                  : mkU32(imm & 3),
                        mkexpr(src)) );

      /* Scalar op: only lane 0 of the destination is written. */
      if (isD)
         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
      else
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );

      goto decode_success;
   }
   15441 
   15442 
   /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x09) {

      IRTemp src0 = newTemp(Ity_F64);
      IRTemp src1 = newTemp(Ity_F64);
      IRTemp res0 = newTemp(Ity_F64);
      IRTemp res1 = newTemp(Ity_F64);
      IRTemp rm   = newTemp(Ity_I32);
      Int    imm  = 0;

      /* modrm follows the three opcode bytes; imm8 follows the amode. */
      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
         imm = insn[3+1];
         /* Only imm bits 2:0 are handled; refuse anything else. */
         if (imm & ~7) goto decode_failure;
         delta += 3+1+1;   /* 3 opcode bytes + modrm + imm8 */
         DIP( "roundpd $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand: must be 16-aligned. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         imm = insn[3+alen];
         if (imm & ~7) goto decode_failure;
         delta += 3+alen+1;   /* 3 opcode bytes + amode + imm8 */
         DIP( "roundpd $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC" instead. */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      /* Round both 64-bit lanes independently with the same mode. */
      assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );

      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );

      goto decode_success;
   }
   15496 
   15497 
   /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x08) {

      IRTemp src0 = newTemp(Ity_F32);
      IRTemp src1 = newTemp(Ity_F32);
      IRTemp src2 = newTemp(Ity_F32);
      IRTemp src3 = newTemp(Ity_F32);
      IRTemp res0 = newTemp(Ity_F32);
      IRTemp res1 = newTemp(Ity_F32);
      IRTemp res2 = newTemp(Ity_F32);
      IRTemp res3 = newTemp(Ity_F32);
      IRTemp rm   = newTemp(Ity_I32);
      Int    imm  = 0;

      /* modrm follows the three opcode bytes; imm8 follows the amode. */
      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
         assign( src2,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
         assign( src3,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
         imm = insn[3+1];
         /* Only imm bits 2:0 are handled; refuse anything else. */
         if (imm & ~7) goto decode_failure;
         delta += 3+1+1;   /* 3 opcode bytes + modrm + imm8 */
         DIP( "roundps $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand: must be 16-aligned. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
         assign( src2, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         assign( src3, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
         imm = insn[3+alen];
         if (imm & ~7) goto decode_failure;
         delta += 3+alen+1;   /* 3 opcode bytes + amode + imm8 */
         DIP( "roundps $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC" instead. */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      /* Round all four 32-bit lanes independently with the same mode. */
      assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
      assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
      assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );

      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );

      goto decode_success;
   }
   15567 
   15568 
   /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
      which we can only decode if we're sure this is an AMD cpu that
      supports LZCNT, since otherwise it's BSR, which behaves
      differently. */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xBD
       && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      /* Two-byte opcode 0F BD, so modrm is at insn[2]. */
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;   /* 2 opcode bytes + modrm */
         DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      IRTemp res = gen_LZCNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(res));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A P are cleared.  Z is set if RESULT == 0.
      // C is set if SRC is zero.
      /* Widen both to 64 bits so the flag computations below can use
         64-bit compares regardless of operand size. */
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp res64 = newTemp(Ity_I64);
      assign(src64, widenUto64(mkexpr(src)));
      assign(res64, widenUto64(mkexpr(res)));

      /* Assemble the O S Z A C P flag vector: Z from result==0,
         C from source==0; all other bits zero. */
      IRTemp oszacp = newTemp(Ity_I64);
      assign(
         oszacp,
         binop(Iop_Or64,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z)),
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_C))
         )
      );

      /* OP_COPY: DEP1 carries the flag bits directly. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

      goto decode_success;
   }
   15627 
   /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
      66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
      66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
      66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
      (selected special cases that actually occur in glibc,
       not by any means a complete implementation.)
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] >= 0x60 && insn[2] <= 0x63)) {

      /* Opcode bit 1 selects the I (implicit-length) forms 0x62/0x63
         over the E (explicit-length) forms 0x60/0x61; the inverse of
         bit 0 selects the M (mask-producing) forms 0x60/0x62 over the
         I (index-producing) forms 0x61/0x63. */
      UInt  isISTRx = insn[2] & 2;
      UInt  isxSTRM = (insn[2] & 1) ^ 1;
      UInt  regNoL = 0;   /* xmm number of the left (E-part) operand */
      UInt  regNoR = 0;   /* xmm number of the right (G-part) operand */
      UChar imm    = 0;   /* the imm8 control byte */

      /* This is a nasty kludge.  We need to pass 2 x V128 to the
         helper (which is clean).  Since we can't do that, use a dirty
         helper to compute the results directly from the XMM regs in
         the guest state.  That means for the memory case, we need to
         move the left operand into a pseudo-register (XMM16, let's
         call it). */
      modrm = insn[3];
      if (epartIsReg(modrm)) {
         regNoL = eregOfRexRM(pfx, modrm);
         regNoR = gregOfRexRM(pfx, modrm);
         imm = insn[3+1];
         delta += 3+1+1;
      } else {
         regNoL = 16; /* use XMM16 as an intermediary */
         regNoR = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* No alignment check; I guess that makes sense, given that
            these insns are for dealing with C style strings. */
         stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
         imm = insn[3+alen];
         delta += 3+alen+1;
      }

      /* Now we know the XMM reg numbers for the operands, and the
         immediate byte.  Is it one we can actually handle? Throw out
         any cases for which the helper function has not been
         verified. */
      switch (imm) {
         case 0x00:
         case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
         case 0x1A: case 0x3A: case 0x44: case 0x4A:
            break;
         default:
            goto decode_failure;
      }

      /* Who ya gonna call?  Presumably not Ghostbusters. */
      void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
      HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

      /* Round up the arguments.  Note that this is a kludge -- the
         use of mkU64 rather than mkIRExpr_HWord implies the
         assumption that the host's word size is 64-bit. */
      UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
      UInt gstOffR = xmmGuestRegOffset(regNoR);

      /* Pack the opcode's low byte and the imm8 together, so the
         helper can select the right sub-case from a single word. */
      IRExpr*  opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF));
      IRExpr*  gstOffLe     = mkU64(gstOffL);
      IRExpr*  gstOffRe     = mkU64(gstOffR);
      /* Explicit-length (E) forms pass the RDX/RAX length registers;
         implicit-length (I) forms pass zero instead. */
      IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
      IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
      IRExpr** args
         = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

      IRTemp   resT = newTemp(Ity_I64);
      IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
      /* It's not really a dirty call, but we can't use the clean
         helper mechanism here for the very lame reason that we can't
         pass 2 x V128s by value to a helper, nor get one back.  Hence
         this roundabout scheme. */
      d->needsBBP = True;
      /* Declare the guest-state reads so instrumenters see them. */
      d->nFxState = 2;
      d->fxState[0].fx     = Ifx_Read;
      d->fxState[0].offset = gstOffL;
      d->fxState[0].size   = sizeof(U128);
      d->fxState[1].fx     = Ifx_Read;
      d->fxState[1].offset = gstOffR;
      d->fxState[1].size   = sizeof(U128);
      if (isxSTRM) {
         /* Declare that the helper writes XMM0. */
         d->nFxState = 3;
         d->fxState[2].fx     = Ifx_Write;
         d->fxState[2].offset = xmmGuestRegOffset(0);
         d->fxState[2].size   = sizeof(U128);
      }

      stmt( IRStmt_Dirty(d) );

      /* Now resT[15:0] holds the new OSZACP values, so the condition
         codes must be updated. And for a xSTRI case, resT[31:16]
         holds the new ECX value, so stash that too. */
      if (!isxSTRM) {
         putIReg64(R_RCX, binop(Iop_And64,
                                binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                                mkU64(0xFFFF)));
      }

      /* Copy the OSZACP bits straight into the flags thunk. */
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
      ));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      /* regNoL == 16 is the memory-operand case (see kludge above),
         so print the decoded address text instead of a register name. */
      if (regNoL == 16) {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, dis_buf, nameXMMReg(regNoR));
      } else {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
      }

      goto decode_success;
   }
   15753 
   15754 
   /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128
      Logical compare (set ZF and CF from AND/ANDN of the operands) */
   if (have66noF2noF3( pfx ) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) {
      modrm = insn[3];
      IRTemp vecE = newTemp(Ity_V128);   /* E operand (xmm2/m128) */
      IRTemp vecG = newTemp(Ity_V128);   /* G operand (xmm1) */

      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "ptest %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "ptest %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));

      /* Set Z=1 iff (vecE & vecG) == 0
         Set C=1 iff (vecE & not vecG) == 0
      */

      /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
      IRTemp andV  = newTemp(Ity_V128);
      IRTemp andnV = newTemp(Ity_V128);
      assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
      /* not(vecG) is formed as vecG XOR all-ones. */
      assign(andnV, binop(Iop_AndV128,
                          mkexpr(vecE),
                          binop(Iop_XorV128, mkexpr(vecG),
                                             mkV128(0xFFFF))));

      /* The same, but reduced to 64-bit values, by or-ing the top
         and bottom 64-bits together.  It relies on this trick:

          InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

          InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
          InterleaveHI64x2([a,b],[a,b]) == [a,a]

          and so the OR of the above 2 exprs produces
          [a OR b, a OR b], from which we simply take the lower half.
      */
      IRTemp and64  = newTemp(Ity_I64);
      IRTemp andn64 = newTemp(Ity_I64);

      assign(
         and64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV))
              )
         )
      );

      assign(
         andn64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV))
              )
          )
       );

      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      IRTemp z64 = newTemp(Ity_I64);
      IRTemp c64 = newTemp(Ity_I64);
      /* z64 = all-ones iff and64 == 0 (note the final Not64 inverts
         the nonzero-detector). */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                              mkexpr(and64)
                        ),
                        mkU8(63)))
      );

      /* c64 = all-ones iff andn64 == 0, by the same construction. */
      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                              mkexpr(andn64)
                        ),
                        mkU8(63)))
      );

      /* And finally, slice out the Z and C flags and set the flags
         thunk to COPY for them.  OSAP are set to zero. */
      IRTemp newOSZACP = newTemp(Ity_I64);
      assign(newOSZACP,
             binop(Iop_Or64,
                   binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                   binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))
             )
      );

      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      goto decode_success;
   }
   15871 
   /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
      66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
      66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
      Blend at various granularities, with XMM0 (implicit operand)
      providing the controlling mask.
   */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
      modrm = insn[3];

      HChar* nm    = NULL;           /* mnemonic for disassembly printing */
      UInt   gran  = 0;              /* lane size in bytes */
      IROp   opSAR = Iop_INVALID;    /* per-lane arithmetic right shift */
      switch (insn[2]) {
         case 0x15:
            nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
            break;
         case 0x14:
            nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
            break;
         case 0x10:
            nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
            break;
      }
      vassert(nm);

      IRTemp vecE = newTemp(Ity_V128);   /* source (xmm2/m128) */
      IRTemp vecG = newTemp(Ity_V128);   /* destination (xmm1) */
      IRTemp vec0 = newTemp(Ity_V128);   /* implicit mask operand, XMM0 */

      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "%s %s,%s\n", nm,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "%s %s,%s\n", nm,
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
      assign(vec0, getXMMReg(0));

      /* Now the tricky bit is to convert vec0 into a suitable mask,
         by copying the most significant bit of each lane into all
         positions in the lane. */
      IRTemp sh = newTemp(Ity_I8);
      assign(sh, mkU8(8 * gran - 1));

      IRTemp mask = newTemp(Ity_V128);
      assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));

      IRTemp notmask = newTemp(Ity_V128);
      assign(notmask, unop(Iop_NotV128, mkexpr(mask)));

      /* Lanes whose XMM0 MSB was set take their value from E; the
         others keep the existing value from G. */
      IRExpr* res = binop(Iop_OrV128,
                          binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
                          binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
      putXMMReg(gregOfRexRM(pfx, modrm), res);

      goto decode_success;
   }
   15940 
   /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
      F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
      The decoding on this is a bit unusual.
   */
   if (haveF2noF3(pfx)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0xF1
           || (insn[2] == 0xF0 && !have66(pfx)))) {
      modrm = insn[3];

      /* 0xF0 is the byte form; for 0xF1 the prefixes have already
         selected an operand size of 2, 4 or 8. */
      if (insn[2] == 0xF0)
         sz = 1;
      else
         vassert(sz == 2 || sz == 4 || sz == 8);

      IRType tyE = szToITy(sz);
      IRTemp valE = newTemp(tyE);

      if (epartIsReg(modrm)) {
         assign(valE, getIRegE(sz, pfx, modrm));
         delta += 3+1;
         /* NOTE(review): the printed mnemonic suffix is always 'b',
            even for the 16/32/64-bit forms of the insn. */
         DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
             nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign(valE, loadLE(tyE, mkexpr(addr)));
         delta += 3+alen;
         DIP("crc32b %s,%s\n", dis_buf,
             nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
      }

      /* Somewhat funny getting/putting of the crc32 value, in order
         to ensure that it turns into 64-bit gets and puts.  However,
         mask off the upper 32 bits so as to not get memcheck false
         +ves around the helper call. */
      IRTemp valG0 = newTemp(Ity_I64);
      assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
                          mkU64(0xFFFFFFFF)));

      /* Select the clean helper matching the source operand width. */
      HChar* nm = NULL;
      void* fn = NULL;
      switch (sz) {
         case 1: nm = "amd64g_calc_crc32b";
                 fn = &amd64g_calc_crc32b; break;
         case 2: nm = "amd64g_calc_crc32w";
                 fn = &amd64g_calc_crc32w; break;
         case 4: nm = "amd64g_calc_crc32l";
                 fn = &amd64g_calc_crc32l; break;
         case 8: nm = "amd64g_calc_crc32q";
                 fn = &amd64g_calc_crc32q; break;
      }
      vassert(nm && fn);
      IRTemp valG1 = newTemp(Ity_I64);
      assign(valG1,
             mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
                           mkIRExprVec_2(mkexpr(valG0),
                                         widenUto64(mkexpr(valE)))));

      /* The result always lands in the 32-bit G register. */
      putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
      goto decode_success;
   }
   16002 
   16003    /* ---------------------------------------------------- */
   16004    /* --- end of the SSE4 decoder                      --- */
   16005    /* ---------------------------------------------------- */
   16006 
   16007    /*after_sse_decoders:*/
   16008 
   16009    /* Get the primary opcode. */
   16010    opc = getUChar(delta); delta++;
   16011 
   16012    /* We get here if the current insn isn't SSE, or this CPU doesn't
   16013       support SSE. */
   16014 
   16015    switch (opc) {
   16016 
   /* ------------------------ Control flow --------------- */

   case 0xC2: /* RET imm16 */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = getUDisp16(delta);   /* extra bytes to pop after the return */
      delta += 2;
      dis_ret(vbi, d64);
      dres.whatNext = Dis_StopHere;   /* control flow leaves the block */
      DIP("ret %lld\n", d64);
      break;

   case 0xC3: /* RET */
      if (have66orF2(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      dis_ret(vbi, 0);
      dres.whatNext = Dis_StopHere;
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      break;

   case 0xE8: /* CALL J4 */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getSDisp32(delta); delta += 4;
      d64 += (guest_RIP_bbstart+delta);
      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
      /* Push the return address onto the guest stack. */
      t1 = newTemp(Ity_I64);
      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t1));
      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
      t2 = newTemp(Ity_I64);
      assign(t2, mkU64((Addr64)d64));
      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* follow into the call target. */
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Call,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("call 0x%llx\n",d64);
      break;
   16058 
   16059 //.. //--    case 0xC8: /* ENTER */
   16060 //.. //--       d32 = getUDisp16(eip); eip += 2;
   16061 //.. //--       abyte = getUChar(delta); delta++;
   16062 //.. //--
   16063 //.. //--       vg_assert(sz == 4);
   16064 //.. //--       vg_assert(abyte == 0);
   16065 //.. //--
   16066 //.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
   16067 //.. //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   16068 //.. //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   16069 //.. //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   16070 //.. //--       uLiteral(cb, sz);
   16071 //.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   16072 //.. //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   16073 //.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   16074 //.. //--       if (d32) {
   16075 //.. //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   16076 //.. //--          uLiteral(cb, d32);
   16077 //.. //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   16078 //.. //--       }
   16079 //.. //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   16080 //.. //--       break;
   16081 
   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      /* RSP = RBP; then RBP = pop(), undoing a standard frame setup. */
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      break;
   16099 
   16100 //.. //--    /* ---------------- Misc weird-ass insns --------------- */
   16101 //.. //--
   16102 //.. //--    case 0x27: /* DAA */
   16103 //.. //--    case 0x2F: /* DAS */
   16104 //.. //--       t1 = newTemp(cb);
   16105 //.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t1);
   16106 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
   16107 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16108 //.. //--       uWiden(cb, 1, False);
   16109 //.. //--       uInstr0(cb, CALLM_S, 0);
   16110 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16111 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16112 //.. //--                   opc == 0x27 ? VGOFF_(helper_DAA) : VGOFF_(helper_DAS) );
   16113 //.. //--       uFlagsRWU(cb, FlagsAC, FlagsSZACP, FlagO);
   16114 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16115 //.. //--       uInstr0(cb, CALLM_E, 0);
   16116 //.. //--       uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, R_AL);
   16117 //.. //--       DIP(opc == 0x27 ? "daa\n" : "das\n");
   16118 //.. //--       break;
   16119 //.. //--
   16120 //.. //--    case 0x37: /* AAA */
   16121 //.. //--    case 0x3F: /* AAS */
   16122 //.. //--       t1 = newTemp(cb);
   16123 //.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
   16124 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
   16125 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16126 //.. //--       uWiden(cb, 2, False);
   16127 //.. //--       uInstr0(cb, CALLM_S, 0);
   16128 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16129 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16130 //.. //--                   opc == 0x37 ? VGOFF_(helper_AAA) : VGOFF_(helper_AAS) );
   16131 //.. //--       uFlagsRWU(cb, FlagA, FlagsAC, FlagsEmpty);
   16132 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16133 //.. //--       uInstr0(cb, CALLM_E, 0);
   16134 //.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
   16135 //.. //--       DIP(opc == 0x37 ? "aaa\n" : "aas\n");
   16136 //.. //--       break;
   16137 //.. //--
   16138 //.. //--    case 0xD4: /* AAM */
   16139 //.. //--    case 0xD5: /* AAD */
   16140 //.. //--       d32 = getUChar(delta); delta++;
   16141 //.. //--       if (d32 != 10) VG_(core_panic)("disInstr: AAM/AAD but base not 10 !");
   16142 //.. //--       t1 = newTemp(cb);
   16143 //.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
   16144 //.. //--       /* Widen %AX to 32 bits, so it's all defined when we push it. */
   16145 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16146 //.. //--       uWiden(cb, 2, False);
   16147 //.. //--       uInstr0(cb, CALLM_S, 0);
   16148 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16149 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16150 //.. //--                   opc == 0xD4 ? VGOFF_(helper_AAM) : VGOFF_(helper_AAD) );
   16151 //.. //--       uFlagsRWU(cb, FlagsEmpty, FlagsSZP, FlagsEmpty);
   16152 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16153 //.. //--       uInstr0(cb, CALLM_E, 0);
   16154 //.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
   16155 //.. //--       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   16156 //.. //--       break;
   16157 
   /* ------------------------ CWD/CDQ -------------------- */

   case 0x98: /* CBW */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Sign-extend the lower half of RAX in place; the operand size
         selects which of CBW/CWDE/CDQE this is. */
      if (sz == 8) {
         putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
         /* NOTE(review): unlike the sibling DIPs below, this one has
            no trailing newline. */
         DIP(/*"cdqe\n"*/"cltq");
         break;
      }
      if (sz == 4) {
         putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
         DIP("cwtl\n");
         break;
      }
      if (sz == 2) {
         putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
         DIP("cbw\n");
         break;
      }
      goto decode_failure;

   case 0x99: /* CWD/CDQ/CQO */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      /* RDX = RAX >>s (width-1): fill RDX with copies of RAX's sign
         bit at the selected operand size. */
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      break;
   16191 
   /* ------------------------ FPU ops -------------------- */

   case 0x9E: /* SAHF */
      /* AH -> low byte of RFLAGS; delegated to a codegen helper. */
      codegen_SAHF();
      DIP("sahf\n");
      break;

   case 0x9F: /* LAHF */
      /* Low byte of RFLAGS -> AH; delegated to a codegen helper. */
      codegen_LAHF();
      DIP("lahf\n");
      break;

   case 0x9B: /* FWAIT */
      /* ignore? */
      DIP("fwait\n");
      break;
   16208 
   /* All eight x87 escape opcodes: hand off to the FPU decoder. */
   case 0xD8:
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      if ( (sz == 4
           || (sz == 8 && redundantREXWok))
           && haveNo66noF2noF3(pfx)) {
         Long delta0    = delta;
         Bool decode_OK = False;
         delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
         if (!decode_OK) {
            /* Roll delta back so the failure path reports the insn
               from its start. */
            delta = delta0;
            goto decode_failure;
         }
         break;
      } else {
         goto decode_failure;
      }
   }
   16243 
   /* ------------------------ INT ------------------------ */

   case 0xCC: /* INT 3 */
      /* Deliver SIGTRAP, resuming at the address of the next insn. */
      jmp_lit(Ijk_SigTRAP, guest_RIP_bbstart + delta);
      dres.whatNext = Dis_StopHere;
      DIP("int $0x3\n");
      break;

   case 0xCD: { /* INT imm8 */
      IRJumpKind jk = Ijk_Boring;
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta); delta++;
      /* Only interrupt vector 32 (0x20) is handled; anything else
         fails the decode. */
      switch (d64) {
         case 32: jk = Ijk_Sys_int32; break;
         default: goto decode_failure;
      }
      guest_RIP_next_mustcheck = True;
      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
      jmp_lit(jk, guest_RIP_next_assumed);
      /* It's important that all ArchRegs carry their up-to-date value
         at this point.  So we declare an end-of-block here, which
         forces any TempRegs caching ArchRegs to be flushed. */
      dres.whatNext = Dis_StopHere;
      DIP("int $0x%02x\n", (UInt)d64);
      break;
   }
   16270 
   /* ------------------------ Jcond, byte offset --------- */

   case 0xEB: /* Jb (jump, byte offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      /* Target = address of next insn + signed 8-bit displacement. */
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (resteerOkFn(callback_opaque,d64)) {
         /* Keep disassembling at the jump target. */
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Boring,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("jmp-8 0x%llx\n", d64);
      break;

   case 0xE9: /* Jv (jump, 16/32 offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      /* Target = address of next insn + signed 32-bit displacement. */
      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
      delta += sz;
      if (resteerOkFn(callback_opaque,d64)) {
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Boring,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("jmp 0x%llx\n", d64);
      break;
   16304 
   case 0x70:
   case 0x71:
   case 0x72: /* JBb/JNAEb (jump below) */
   case 0x73: /* JNBb/JAEb (jump not below) */
   case 0x74: /* JZb/JEb (jump zero) */
   case 0x75: /* JNZb/JNEb (jump not zero) */
   case 0x76: /* JBEb/JNAb (jump below or equal) */
   case 0x77: /* JNBEb/JAb (jump not below or equal) */
   case 0x78: /* JSb (jump negative) */
   case 0x79: /* JNSb (jump not negative) */
   case 0x7A: /* JP (jump parity even) */
   case 0x7B: /* JNP/JPO (jump parity odd) */
   case 0x7C: /* JLb/JNGEb (jump less) */
   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   case 0x7E: /* JLEb/JNGb (jump less or equal) */
   case 0x7F: /* JGb/JNLEb (jump greater) */
    { Long   jmpDelta;
      HChar* comment  = "";
      /* Jcc with a byte displacement.  The condition code is encoded
         in the low nibble of the opcode: (opc - 0x70) is used directly
         as the AMD64Condcode below, and XOR-ing it with 1 yields the
         negated condition. */
      if (haveF2orF3(pfx)) goto decode_failure;
      jmpDelta = getSDisp8(delta);
      vassert(-128 <= jmpDelta && jmpDelta < 128);
      /* Target address is relative to the end of this instruction
         (one opcode byte + one displacement byte past bbstart+delta). */
      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
      delta++;
      /* Conditional-branch chasing: when permitted, speculate on the
         branch direction (backward => taken, forward => not taken),
         emit a side-exit covering the other direction, and keep
         disassembling along the predicted path. */
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta < 0
          && resteerOkFn( callback_opaque, d64) ) {
         /* Speculation: assume this backward branch is taken.  So we
            need to emit a side-exit to the insn following this one,
            on the negation of the condition, and continue at the
            branch target address (d64).  If we wind up back at the
            first instruction of the trace, just stop; it's better to
            let the IR loop unroller handle that case. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition(
                     (AMD64Condcode)(1 ^ (opc - 0x70))),
                  Ijk_Boring,
                  IRConst_U64(guest_RIP_bbstart+delta) ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = d64;
         comment = "(assumed taken)";
      }
      else
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta >= 0
          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
         /* Speculation: assume this forward branch is not taken.  So
            we need to emit a side-exit to d64 (the dest) and continue
            disassembling at the insn immediately following this
            one. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
                  Ijk_Boring,
                  IRConst_U64(d64) ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = guest_RIP_bbstart+delta;
         comment = "(assumed not taken)";
      }
      else {
         /* Conservative default translation - end the block at this
            point. */
         jcc_01( (AMD64Condcode)(opc - 0x70),
                 guest_RIP_bbstart+delta,
                 d64 );
         dres.whatNext = Dis_StopHere;
      }
      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
      break;
    }
   16377 
   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override.  Note the
         register test uses the address-size prefix, not the
         operand-size one. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      /* Byte displacement, relative to the end of this insn. */
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit: side-exit to d64 iff ECX (zero-extended to 64
            bits) is zero. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                            unop(Iop_32Uto64, getIReg32(R_RCX)),
                            mkU64(0)),
               Ijk_Boring,
               IRConst_U64(d64))
             );
         DIP("jecxz 0x%llx\n", d64);
      } else {
         /* 64-bit: side-exit to d64 iff RCX is zero. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
               Ijk_Boring,
               IRConst_U64(d64))
             );
         DIP("jrcxz 0x%llx\n", d64);
      }
      break;
   16403 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      HChar*  xtra  = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      /* Branch target: byte displacement relative to the end of this
         insn. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      /* For LOOPE/LOOPNE, additionally AND in the required ZF
         state. */
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Side-exit to the loop top when the condition holds; otherwise
         fall through to the next insn. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
      break;
    }
   16459 
   16460    /* ------------------------ IMUL ----------------------- */
   16461 
   case 0x69: /* IMUL Iv, Ev, Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
      break;
   case 0x6B: /* IMUL Ib, Ev, Gv */
      /* NOTE(review): unlike 0x69 above, this arm does not reject
         F2/F3 prefixes -- looks like an oversight, but adding the
         check would change which encodings decode; confirm against
         the architecture manual before tightening. */
      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
      break;
   16469 
   16470    /* ------------------------ MOV ------------------------ */
   16471 
   case 0x88: /* MOV Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, 1, delta);
      break;

   case 0x89: /* MOV Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, sz, delta);
      break;

   case 0x8A: /* MOV Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, 1, delta);
      break;

   case 0x8B: /* MOV Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, sz, delta);
      break;

   case 0x8D: /* LEA M,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4 && sz != 8)
         goto decode_failure;
      modrm = getUChar(delta);
      /* LEA requires a memory operand; a register E part is an
         invalid encoding. */
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we clear
         any segment override bits in pfx. */
      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
      delta += alen;
      /* This is a hack.  But it isn't clear that really doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
      putIRegG( sz, pfx, modrm,
                         sz == 4
                            ? unop(Iop_64to32, mkexpr(addr))
                            : mkexpr(addr)
              );
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIRegG(sz,pfx,modrm));
      break;
   16515 
   16516 //..    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   16517 //..       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   16518 //..       break;
   16519 //..
   16520 //..    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   16521 //..       delta = dis_mov_Ew_Sw(sorb, delta);
   16522 //..       break;
   16523 
   case 0xA0: /* MOV Ob,AL */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* The moffs operand is a full 8-byte absolute address here;
         segment and address-size overrides are applied to it by
         handleAddrOverrides below. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      /* Load form: [moffs] -> AL/AX/EAX/RAX. */
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), d64,
                                  nameIRegRAX(sz));
      break;

   case 0xA2: /* MOV AL,Ob */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      /* Store form: AL/AX/EAX/RAX -> [moffs]. */
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), d64);
      break;
   16558 
   16559    /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* One-byte immediate into the byte register selected by the
         low 3 opcode bits (plus REX.B, via putIRegRexB).  Presumably
         putIRegRexB also handles the AH..BH vs SPL..DIL distinction
         when a REX prefix is present -- see its definition. */
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      break;
   16574 
   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         /* movabsq: full 8-byte immediate. */
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 16-/32-bit forms: sign-extended imm16/imm32, masked back
            down to the destination size. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      break;
   16602 
   case 0xC6: /* MOV Ib,Eb */
      sz = 1;
      goto do_Mov_I_E;
   case 0xC7: /* MOV Iv,Ev */
      goto do_Mov_I_E;

   do_Mov_I_E:
      /* NOTE(review): the greg field of the modrm byte is not
         validated here, although only /0 is defined as MOV for
         C6/C7 -- confirm whether other greg values should go to
         decode_failure. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) {
         delta++; /* mod/rm byte */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegE(sz, pfx, modrm,
                      mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz),
                                  (Long)d64,
                                  nameIRegE(sz,pfx,modrm));
      } else {
         /* Memory destination.  The immediate follows the addressing
            bytes, so disAMode is told how many extra bytes trail the
            amode (needed for correct RIP-relative addressing). */
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                           /*xtra*/imin(4,sz) );
         delta += alen;
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         storeLE(mkexpr(addr),
                 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
      }
      break;
   16632 
   16633    /* ------------------------ MOVx ------------------------ */
   16634 
   case 0x63: /* MOVSX */
      /* MOVSXD: sign-extend r/m32 into a 64-bit register.  Only the
         REX.W form is accepted here. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (haveREX(pfx) && 1==getRexW(pfx)) {
         vassert(sz == 8);
         /* movsx r/m32 to r64 */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  getIRegE(4, pfx, modrm)));
            DIP("movslq %s,%s\n",
                nameIRegE(4, pfx, modrm),
                nameIRegG(8, pfx, modrm));
            break;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  loadLE(Ity_I32, mkexpr(addr))));
            DIP("movslq %s,%s\n", dis_buf,
                nameIRegG(8, pfx, modrm));
            break;
         }
      } else {
         /* No REX.W: the 32-/16-bit variants are not handled. */
         goto decode_failure;
      }
   16663 
   16664    /* ------------------------ opl imm, A ----------------- */
   16665 
   case 0x04: /* ADD Ib, AL */
      /* ALU/TEST ops with an immediate source and AL/eAX as the
         destination.  The dis_op_imm_A arguments appear to be
         (size, carry-in, op8, keep-result, delta, name): ADC/SBB pass
         True for carry-in, and CMP/TEST pass False for keep-result
         (flags only) -- TODO confirm against dis_op_imm_A's
         definition. */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
      break;
   case 0x05: /* ADD Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
      break;

   case 0x0C: /* OR Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
      break;
   case 0x0D: /* OR Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
      break;

   case 0x14: /* ADC Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
      break;
   case 0x15: /* ADC Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
      break;

   case 0x1C: /* SBB Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
      break;
   case 0x1D: /* SBB Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
      break;

   case 0x24: /* AND Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
      break;
   case 0x25: /* AND Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
      break;

   case 0x2C: /* SUB Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
      break;
   case 0x2D: /* SUB Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
      break;

   case 0x34: /* XOR Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
      break;
   case 0x35: /* XOR Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
      break;

   case 0x3C: /* CMP Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
      break;
   case 0x3D: /* CMP Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
      break;

   case 0xA8: /* TEST Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      break;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      break;
   16746 
   16747    /* ------------------------ opl Ev, Gv ----------------- */
   16748 
   case 0x02: /* ADD Eb,Gb */
      /* ALU/TEST ops with E (reg-or-mem) source and G (reg)
         destination.  dis_op2_E_G argument order mirrors
         dis_op_imm_A: (vbi, pfx, carry-in, op8, keep-result, size,
         delta, name) -- TODO confirm against its definition. */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x03: /* ADD Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x0A: /* OR Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x0B: /* OR Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x12: /* ADC Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x13: /* ADC Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x1A: /* SBB Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x1B: /* SBB Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x22: /* AND Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x23: /* AND Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x2A: /* SUB Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x2B: /* SUB Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x32: /* XOR Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x33: /* XOR Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x3A: /* CMP Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x3B: /* CMP Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;

   case 0x84: /* TEST Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
      break;
   case 0x85: /* TEST Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
      break;
   16829 
   16830    /* ------------------------ opl Gv, Ev ----------------- */
   16831 
   case 0x00: /* ADD Gb,Eb */
      /* ALU ops with G (reg) source and E (reg-or-mem) destination;
         the mirror image of the 0x02..0x85 group above, dispatched to
         dis_op2_G_E with the same argument layout. */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x01: /* ADD Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x08: /* OR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x09: /* OR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x10: /* ADC Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x11: /* ADC Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x18: /* SBB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x19: /* SBB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x20: /* AND Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x21: /* AND Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x28: /* SUB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x29: /* SUB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x30: /* XOR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x31: /* XOR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x38: /* CMP Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x39: /* CMP Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;
   16903 
   16904    /* ------------------------ POP ------------------------ */
   16905 
   case 0x58: /* POP eAX */
   case 0x59: /* POP eCX */
   case 0x5A: /* POP eDX */
   case 0x5B: /* POP eBX */
   case 0x5D: /* POP eBP */
   case 0x5E: /* POP eSI */
   case 0x5F: /* POP eDI */
   case 0x5C: /* POP eSP */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
      /* t2 = old RSP; t1 = value loaded from [old RSP]. */
      t1 = newTemp(szToITy(sz));
      t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
      /* Bump RSP, then write the destination register.  The ordering
         makes POP rSP (0x5C) correct too: the loaded value overwrites
         the bumped RSP. */
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
      break;
   16926 
   case 0x9D: /* POPF */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      /* t2 = old RSP; t1 = popped flag word, widened to 64 bits. */
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1)))),
                  mkU64(1),
                  mkU64(0xFFFFFFFFFFFFFFFFULL)))
          );

      /* And set the ID flag (bit 21 of t1); the guest-state IDFLAG
         field simply holds the bit value. */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      /* And set the AC flag too (bit 18 of t1). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      DIP("popf%c\n", nameISize(sz));
      break;
   16994 
   16995 //..    case 0x61: /* POPA */
   16996 //..       /* This is almost certainly wrong for sz==2.  So ... */
   16997 //..       if (sz != 4) goto decode_failure;
   16998 //..
   16999 //..       /* t5 is the old %ESP value. */
   17000 //..       t5 = newTemp(Ity_I32);
   17001 //..       assign( t5, getIReg(4, R_ESP) );
   17002 //..
   17003 //..       /* Reload all the registers, except %esp. */
   17004 //..       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   17005 //..       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   17006 //..       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   17007 //..       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   17008 //..       /* ignore saved %ESP */
   17009 //..       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   17010 //..       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   17011 //..       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   17012 //..
   17013 //..       /* and move %ESP back up */
   17014 //..       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   17015 //..
//..       DIP("popa%c\n", nameISize(sz));
   17017 //..       break;
   17018 
   case 0x8F: { /* POPQ m64 / POPW m16 */
      Int   len;
      UChar rm;
      /* There is no encoding for 32-bit pop in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4
              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      rm = getUChar(delta);

      /* make sure this instruction is correct POP: the operand must
         be memory, and the greg field must be /0 */
      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
         goto decode_failure;
      /* and has correct size */
      vassert(sz == 8);

      /* t1 = old RSP; t3 = the popped 64-bit value. */
      t1 = newTemp(Ity_I64);
      t3 = newTemp(Ity_I64);
      assign( t1, getIReg64(R_RSP) );
      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );

      /* Increase RSP; must be done before the STORE.  Intel manual
         says: If the RSP register is used as a base register for
         addressing a destination operand in memory, the POP
         instruction computes the effective address of the operand
         after it increments the RSP register.  */
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );

      /* Resolve the destination amode (with the already-bumped RSP)
         and store the popped value there. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      storeLE( mkexpr(addr), mkexpr(t3) );

      DIP("popl %s\n", dis_buf);

      delta += len;
      break;
   }
   17058 
   17059 //.. //--    case 0x1F: /* POP %DS */
   17060 //.. //--       dis_pop_segreg( cb, R_DS, sz ); break;
   17061 //.. //--    case 0x07: /* POP %ES */
   17062 //.. //--       dis_pop_segreg( cb, R_ES, sz ); break;
   17063 //.. //--    case 0x17: /* POP %SS */
   17064 //.. //--       dis_pop_segreg( cb, R_SS, sz ); break;
   17065 
   17066    /* ------------------------ PUSH ----------------------- */
   17067 
   case 0x50: /* PUSH eAX */
   case 0x51: /* PUSH eCX */
   case 0x52: /* PUSH eDX */
   case 0x53: /* PUSH eBX */
   case 0x55: /* PUSH eBP */
   case 0x56: /* PUSH eSI */
   case 0x57: /* PUSH eDI */
   case 0x54: /* PUSH eSP */
      /* PUSH reg: decrement RSP by the operand size, then store the
         register value at the new RSP. */
      /* This is the Right Way, in that the value to be pushed is
         established before %rsp is changed, so that pushq %rsp
         correctly pushes the old value. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
      ty = sz==2 ? Ity_I16 : Ity_I64;
      t1 = newTemp(ty);
      t2 = newTemp(Ity_I64);
      /* Read the source register first; REX.B extends the register
         number encoded in the low opcode bits to r8..r15. */
      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
      /* t2 = the new (decremented) RSP. */
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
      putIReg64(R_RSP, mkexpr(t2) );
      storeLE(mkexpr(t2),mkexpr(t1));
      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
      break;
   17092 
   case 0x68: /* PUSH Iv */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      /* Immediate is at most 4 bytes wide, sign-extended to sz. */
      d64 = getSDisp(imin(4,sz),delta);
      delta += imin(4,sz);
      goto do_push_I;
   case 0x6A: /* PUSH Ib, sign-extended to sz */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      d64 = getSDisp8(delta); delta += 1;
      goto do_push_I;
   do_push_I:
      /* Common tail for 0x68/0x6A: push the immediate held in d64,
         at width sz. */
      ty = szToITy(sz);
      t1 = newTemp(Ity_I64);
      /* NOTE(review): t2 appears to be assigned here but never read
         in this arm -- looks like a leftover; confirm before removing. */
      t2 = newTemp(ty);
      /* t1 = new RSP; update RSP before storing. */
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d32 is a negative 16-bit number
         (bug #132813) */
      if (ty == Ity_I16)
         d64 &= 0xFFFF;
      storeLE( mkexpr(t1), mkU(ty,d64) );
      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
      break;
   17119 
   case 0x9C: /* PUSHF */ {
      /* Push the guest rflags value onto the stack.  The flags are
         reconstructed from the CC thunk, then the D, ID and AC bits
         (which live in separate guest-state fields) are patched in. */
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      /* t1 = new RSP; move the stack pointer down first. */
      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* t2 = O S Z A C P flags, recovered from the CC thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag. */
      /* OFFB_IDFLAG holds 0 or 1; shift it up to rflags bit 21. */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too. */
      /* OFFB_ACFLAG holds 0 or 1; shift it up to rflags bit 18. */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      break;
   }
   17177 
   17178 //..    case 0x60: /* PUSHA */
   17179 //..       /* This is almost certainly wrong for sz==2.  So ... */
   17180 //..       if (sz != 4) goto decode_failure;
   17181 //..
   17182 //..       /* This is the Right Way, in that the value to be pushed is
   17183 //..          established before %esp is changed, so that pusha
   17184 //..          correctly pushes the old %esp value.  New value of %esp is
   17185 //..          pushed at start. */
   17186 //..       /* t0 is the %ESP value we're going to push. */
   17187 //..       t0 = newTemp(Ity_I32);
   17188 //..       assign( t0, getIReg(4, R_ESP) );
   17189 //..
   17190 //..       /* t5 will be the new %ESP value. */
   17191 //..       t5 = newTemp(Ity_I32);
   17192 //..       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   17193 //..
   17194 //..       /* Update guest state before prodding memory. */
   17195 //..       putIReg(4, R_ESP, mkexpr(t5));
   17196 //..
   17197 //..       /* Dump all the registers. */
   17198 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   17199 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   17200 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   17201 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   17202 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   17203 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   17204 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   17205 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   17206 //..
   17207 //..       DIP("pusha%c\n", nameISize(sz));
   17208 //..       break;
   17209 //..
   17210 //..
   17211 //.. //--    case 0x0E: /* PUSH %CS */
   17212 //.. //--       dis_push_segreg( cb, R_CS, sz ); break;
   17213 //.. //--    case 0x1E: /* PUSH %DS */
   17214 //.. //--       dis_push_segreg( cb, R_DS, sz ); break;
   17215 //.. //--    case 0x06: /* PUSH %ES */
   17216 //.. //--       dis_push_segreg( cb, R_ES, sz ); break;
   17217 //.. //--    case 0x16: /* PUSH %SS */
   17218 //.. //--       dis_push_segreg( cb, R_SS, sz ); break;
   17219 //..
   17220 //..    /* ------------------------ SCAS et al ----------------- */
   17221 //..
   17222 //..    case 0xA4: /* MOVS, no REP prefix */
   17223 //..    case 0xA5:
   17224 //..       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   17225 //..       break;
   17226 //..
   17227 //..   case 0xA6: /* CMPSb, no REP prefix */
   17228 //.. //--    case 0xA7:
   17229 //..      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   17230 //..      break;
   17231 //.. //--
   17232 //.. //--
   17233     case 0xAC: /* LODS, no REP prefix */
   17234     case 0xAD:
   17235        dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
   17236        break;
   17237 //..
   17238 //..    case 0xAE: /* SCAS, no REP prefix */
   17239 //..    case 0xAF:
   17240 //..       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   17241 //..       break;
   17242 
   17243 
   case 0xFC: /* CLD */
      /* Clear the direction flag.  The guest DFLAG field stores the
         flag as a full 64-bit value: 1 for DF=0, -1 (all ones) for
         DF=1, as the two Puts here and in STD show. */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
      DIP("cld\n");
      break;

   case 0xFD: /* STD */
      /* Set the direction flag: DFLAG := -1 (see CLD for encoding). */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
      DIP("std\n");
      break;
   17255 
   17256    case 0xF8: /* CLC */
   17257    case 0xF9: /* STC */
   17258    case 0xF5: /* CMC */
   17259       t0 = newTemp(Ity_I64);
   17260       t1 = newTemp(Ity_I64);
   17261       assign( t0, mk_amd64g_calculate_rflags_all() );
   17262       switch (opc) {
   17263          case 0xF8:
   17264             assign( t1, binop(Iop_And64, mkexpr(t0),
   17265                                          mkU64(~AMD64G_CC_MASK_C)));
   17266             DIP("clc\n");
   17267             break;
   17268          case 0xF9:
   17269             assign( t1, binop(Iop_Or64, mkexpr(t0),
   17270                                         mkU64(AMD64G_CC_MASK_C)));
   17271             DIP("stc\n");
   17272             break;
   17273          case 0xF5:
   17274             assign( t1, binop(Iop_Xor64, mkexpr(t0),
   17275                                          mkU64(AMD64G_CC_MASK_C)));
   17276             DIP("cmc\n");
   17277             break;
   17278          default:
   17279             vpanic("disInstr(x64)(clc/stc/cmc)");
   17280       }
   17281       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   17282       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   17283       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   17284       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   17285          elimination of previous stores to this field work better. */
   17286       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   17287       break;
   17288 
   17289 //..    /* REPNE prefix insn */
   17290 //..    case 0xF2: {
   17291 //..       Addr32 eip_orig = guest_eip_bbstart + delta - 1;
   17292 //..       vassert(sorb == 0);
   17293 //..       abyte = getUChar(delta); delta++;
   17294 //..
   17295 //..       if (abyte == 0x66) { sz = 2; abyte = getUChar(delta); delta++; }
   17296 //..       whatNext = Dis_StopHere;
   17297 //..
   17298 //..       switch (abyte) {
   17299 //..       /* According to the Intel manual, "repne movs" should never occur, but
   17300 //..        * in practice it has happened, so allow for it here... */
   17301 //..       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   17302 //..         goto decode_failure;
   17303 //.. //--       case 0xA5:
   17304 //..         //         dis_REP_op ( CondNZ, dis_MOVS, sz, eip_orig,
   17305 //..         //                              guest_eip_bbstart+delta, "repne movs" );
   17306 //..         //         break;
   17307 //.. //--
   17308 //.. //--       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   17309 //.. //--       case 0xA7:
   17310 //.. //--          dis_REP_op ( cb, CondNZ, dis_CMPS, sz, eip_orig, eip, "repne cmps" );
   17311 //.. //--          break;
   17312 //.. //--
   17313 //..       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   17314 //..       case 0xAF:
   17315 //..          dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
   17316 //..                                  guest_eip_bbstart+delta, "repne scas" );
   17317 //..          break;
   17318 //..
   17319 //..       default:
   17320 //..          goto decode_failure;
   17321 //..       }
   17322 //..       break;
   17323 //..    }
   17324 
   17325    /* ------ AE: SCAS variants ------ */
   17326    case 0xAE:
   17327    case 0xAF:
   17328       /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
   17329       if (haveASO(pfx))
   17330          goto decode_failure;
   17331       if (haveF2(pfx) && !haveF3(pfx)) {
   17332          if (opc == 0xAE)
   17333             sz = 1;
   17334          dis_REP_op ( AMD64CondNZ, dis_SCAS, sz,
   17335                       guest_RIP_curr_instr,
   17336                       guest_RIP_bbstart+delta, "repne scas", pfx );
   17337          dres.whatNext = Dis_StopHere;
   17338          break;
   17339       }
   17340       /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
   17341       if (haveASO(pfx))
   17342          goto decode_failure;
   17343       if (!haveF2(pfx) && haveF3(pfx)) {
   17344          if (opc == 0xAE)
   17345             sz = 1;
   17346          dis_REP_op ( AMD64CondZ, dis_SCAS, sz,
   17347                       guest_RIP_curr_instr,
   17348                       guest_RIP_bbstart+delta, "repe scas", pfx );
   17349          dres.whatNext = Dis_StopHere;
   17350          break;
   17351       }
   17352       /* AE/AF: scasb/scas{w,l,q} */
   17353       if (!haveF2(pfx) && !haveF3(pfx)) {
   17354          if (opc == 0xAE)
   17355             sz = 1;
   17356          dis_string_op( dis_SCAS, sz, "scas", pfx );
   17357          break;
   17358       }
   17359       goto decode_failure;
   17360 
   /* ------ A6, A7: CMPS variants ------ */
   case 0xA6:
   case 0xA7:
      /* Only the F3 (repe) form is handled here; bare CMPS and the
         F2 form fall through to decode_failure. */
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveASO(pfx))
         goto decode_failure;
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres.whatNext = Dis_StopHere;
         break;
      }
      goto decode_failure;

   /* ------ AA, AB: STOS variants ------ */
   case 0xAA:
   case 0xAB:
      /* AA is the byte form (stosb); AB uses the prevailing operand
         size.  Handles plain and F3 (rep) forms; F2 is rejected. */
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveASO(pfx))
         goto decode_failure;
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
        dres.whatNext = Dis_StopHere;
        break;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         break;
      }
      goto decode_failure;

   /* ------ A4, A5: MOVS variants ------ */
   case 0xA4:
   case 0xA5:
      /* A4 is the byte form (movsb); A5 uses the prevailing operand
         size.  Handles plain and F3 (rep) forms; F2 is rejected. */
      /* F3 A4: rep movsb */
      if (haveASO(pfx))
         goto decode_failure;
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_REP_op ( AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres.whatNext = Dis_StopHere;
        break;
      }
      /* A4: movsb */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         break;
      }
      goto decode_failure;
   17425 
   17426 
   17427    /* ------------------------ XCHG ----------------------- */
   17428 
   17429    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   17430       prefix.  Therefore, surround it with a IRStmt_MBE(Imbe_BusLock)
   17431       and IRStmt_MBE(Imbe_BusUnlock) pair.  But be careful; if it is
   17432       used with an explicit LOCK prefix, we don't want to end up with
   17433       two IRStmt_MBE(Imbe_BusLock)s -- one made here and one made by
   17434       the generic LOCK logic at the top of disInstr. */
   case 0x86: /* XCHG Gb,Eb */
      sz = 1;
      /* Fall through ... */
   case 0x87: /* XCHG Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      ty = szToITy(sz);
      t1 = newTemp(ty); t2 = newTemp(ty);
      if (epartIsReg(modrm)) {
         /* Register-register form: a plain swap; no atomicity
            concerns, so no CAS is needed. */
         assign(t1, getIRegE(sz, pfx, modrm));
         assign(t2, getIRegG(sz, pfx, modrm));
         putIRegG(sz, pfx, modrm, mkexpr(t1));
         putIRegE(sz, pfx, modrm, mkexpr(t2));
         delta++;
         DIP("xchg%c %s, %s\n",
             nameISize(sz), nameIRegG(sz, pfx, modrm),
                            nameIRegE(sz, pfx, modrm));
      } else {
         /* Register-memory form: XCHG is implicitly locked (see the
            comment above this case), so express the memory update as
            a CAS and tell the caller to expect one. */
         *expect_CAS = True;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         assign( t1, loadLE(ty, mkexpr(addr)) );
         assign( t2, getIRegG(sz, pfx, modrm) );
         casLE( mkexpr(addr),
                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
         putIRegG( sz, pfx, modrm, mkexpr(t1) );
         delta += alen;
         DIP("xchg%c %s, %s\n", nameISize(sz),
                                nameIRegG(sz, pfx, modrm), dis_buf);
      }
      break;
   17465 
   case 0x90: /* XCHG eAX,eAX */
      /* 0x90 needs special treatment before the generic rAX-swap:
         F3 90 is PAUSE, and plain 90 (no REX.B) is NOP. */
      /* detect and handle F3 90 (rep nop) specially */
      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(Ijk_Yield, guest_RIP_bbstart+delta);
         dres.whatNext = Dis_StopHere;
         break;
      }
      /* detect and handle NOPs specially */
      if (/* F2/F3 probably change meaning completely */
          !haveF2orF3(pfx)
          /* If REX.B is 1, we're not exchanging rAX with itself */
          && getRexB(pfx)==0 ) {
         DIP("nop\n");
         break;
      }
      /* else fall through to normal case. */
   case 0x91: /* XCHG rAX,rCX */
   case 0x92: /* XCHG rAX,rDX */
   case 0x93: /* XCHG rAX,rBX */
   case 0x94: /* XCHG rAX,rSP */
   case 0x95: /* XCHG rAX,rBP */
   case 0x96: /* XCHG rAX,rSI */
   case 0x97: /* XCHG rAX,rDI */

      /* guard against mutancy */
      if (haveF2orF3(pfx)) goto decode_failure;

      /* sz == 2 could legitimately happen, but we don't handle it yet */
      if (sz == 2) goto decode_failure; /* awaiting test case */

      /* Swap rAX with the register encoded in the low opcode bits
         (extended by REX.B inside the helper). */
      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
      break;
   17501 
   17502 //.. //--    /* ------------------------ XLAT ----------------------- */
   17503 //.. //--
   17504 //.. //--    case 0xD7: /* XLAT */
   17505 //.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
   17506 //.. //--       uInstr2(cb, GET, sz, ArchReg, R_EBX, TempReg, t1); /* get eBX */
   17507 //.. //--       handleAddrOverrides( cb, sorb, t1 );               /* make t1 DS:eBX */
   17508 //.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t2); /* get AL */
   17509 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we add it. */
   17510 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t2);
   17511 //.. //--       uWiden(cb, 1, False);
   17512 //.. //--       uInstr2(cb, ADD, sz, TempReg, t2, TempReg, t1);  /* add AL to eBX */
   17513 //.. //--       uInstr2(cb, LOAD, 1, TempReg, t1,  TempReg, t2); /* get byte at t1 into t2 */
   17514 //.. //--       uInstr2(cb, PUT, 1, TempReg, t2, ArchReg, R_AL); /* put byte into AL */
   17515 //.. //--
   17516 //.. //--       DIP("xlat%c [ebx]\n", nameISize(sz));
   17517 //.. //--       break;
   17518 
   17519    /* ------------------------ IN / OUT ----------------------- */
   17520 
   case 0xE4: /* IN imm8, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      /* t1 = port number taken from the immediate byte. */
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      /* t1 = port number taken from DX, zero-extended to 64 bits. */
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I64);
      /* The port read itself happens in a dirty helper, since it is
         a side effect outside the IR's view. */
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "amd64g_dirtyhelper_IN",
             &amd64g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
      break;
   }
   17569 
   case 0xE6: /* OUT AL, imm8 */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      /* t1 = port number taken from the immediate byte. */
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xE7: /* OUT eAX, imm8 */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xEE: /* OUT AL, %DX */
      sz = 1;
      t1 = newTemp(Ity_I64);
      /* t1 = port number taken from DX, zero-extended to 64 bits. */
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   case 0xEF: /* OUT eAX, %DX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      /* The port write happens in a dirty helper; it takes the port,
         the (widened) rAX value, and the width. */
      d = unsafeIRDirty_0_N(
             0/*regparms*/,
             "amd64g_dirtyhelper_OUT",
             &amd64g_dirtyhelper_OUT,
             mkIRExprVec_3( mkexpr(t1),
                            widenUto64( getIRegRAX(sz) ),
                            mkU64(sz) )
          );
      stmt( IRStmt_Dirty(d) );
      break;
   }
   17616 
   17617    /* ------------------------ (Grp1 extensions) ---------- */
   17618 
   case 0x80: /* Grp1 Ib,Eb */
      /* Grp1: arithmetic op (ADD/OR/ADC/SBB/AND/SUB/XOR/CMP) chosen
         by the reg field of the modrm byte.  This form: 8-bit
         operand with an 8-bit immediate. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      sz    = 1;
      d_sz  = 1;
      d64   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;

   case 0x81: /* Grp1 Iv,Ev */
      /* As 0x80, but sz-wide operand with an immediate of at most
         4 bytes (sign-extended to sz). */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = imin(sz,4);
      d64   = getSDisp(d_sz, delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;

   case 0x83: /* Grp1 Ib,Ev */
      /* As 0x81, but the immediate is a single sign-extended byte. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;
   17646 
   17647    /* ------------------------ (Grp2 extensions) ---------- */
   17648 
   case 0xC0: { /* Grp2 Ib,Eb */
      /* Grp2: shift/rotate (ROL/ROR/RCL/RCR/SHL/SHR/SAR) chosen by
         the reg field of the modrm byte.  This form: 8-bit operand,
         immediate shift amount. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xC1: { /* Grp2 Ib,Ev */
      /* sz-wide operand, immediate shift amount. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD0: { /* Grp2 1,Eb */
      /* 8-bit operand, shift amount fixed at 1. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD1: { /* Grp2 1,Ev */
      /* sz-wide operand, shift amount fixed at 1. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD2: { /* Grp2 CL,Eb */
      /* 8-bit operand, shift amount taken from CL. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD3: { /* Grp2 CL,Ev */
      /* sz-wide operand, shift amount taken from CL. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   17722 
   17723    /* ------------------------ (Grp3 extensions) ---------- */
   17724 
   case 0xF6: { /* Grp3 Eb */
      /* Grp3: TEST/NOT/NEG/MUL/IMUL/DIV/IDIV chosen by modrm.reg;
         decoding is delegated to dis_Grp3.  8-bit operand form. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xF7: { /* Grp3 Ev */
      /* As 0xF6, but with the prevailing operand size sz. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   17739 
   17740    /* ------------------------ (Grp4 extensions) ---------- */
   17741 
   case 0xFE: { /* Grp4 Eb */
      /* Grp4: INC/DEC on an 8-bit operand, chosen by modrm.reg;
         decoding is delegated to dis_Grp4. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }

   /* ------------------------ (Grp5 extensions) ---------- */

   case 0xFF: { /* Grp5 Ev */
      /* Grp5: INC/DEC/CALL/JMP/PUSH on an sz-wide operand, chosen by
         modrm.reg; decoding is delegated to dis_Grp5, which may also
         update dres (e.g. to stop the block after a branch). */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp5 ( vbi, pfx, sz, delta, &dres, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   17759 
   17760    /* ------------------------ Escapes to 2-byte opcodes -- */
   17761 
   17762    case 0x0F: {
   17763       opc = getUChar(delta); delta++;
   17764       switch (opc) {
   17765 
   17766       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   17767 
      case 0xBA: { /* Grp8 Ib,Ev */
         /* Group-8 bit-test ops (BT/BTS/BTR/BTC per the opcode map)
            with an immediate bit index.  The sign-extended imm8 sits
            just after the amode, hence getSDisp8(delta + am_sz). */
         Bool decode_OK = False;
         if (haveF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         am_sz = lengthAMode(pfx,delta);
         d64   = getSDisp8(delta + am_sz);
         delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
                                &decode_OK );
         if (!decode_OK)
            goto decode_failure;
         break;
      }

      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */

      case 0xBC: /* BSF Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, True /*fwds, ie BSF*/ );
         break;
      case 0xBD: /* BSR Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False /*bwds, ie BSR*/ );
         break;
   17791 
   17792       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   17793 
      case 0xC8: /* BSWAP %eax */
      case 0xC9:
      case 0xCA:
      case 0xCB:
      case 0xCC:
      case 0xCD:
      case 0xCE:
      case 0xCF: /* BSWAP %edi */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* According to the AMD64 docs, this insn can have size 4 or
            8. */
         if (sz == 4) {
            /* 32-bit byteswap, built from shifts, masks and ORs:
               t2 = (t1<<24) | ((t1<<8)&0x00FF0000)
                  | ((t1>>8)&0x0000FF00) | ((t1>>24)&0x000000FF) */
            t1 = newTemp(Ity_I32);
            t2 = newTemp(Ity_I32);
            assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
            assign( t2,
               binop(Iop_Or32,
                  binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x00FF0000)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x0000FF00)),
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
                                   mkU32(0x000000FF) )
               )))
            );
            putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
            break;
         }
         else if (sz == 8) {
            /* 64-bit byteswap in three mask-and-swap stages:
               s8 swaps adjacent bytes, s16 swaps adjacent 16-bit
               halves of s8, and t2 swaps the two 32-bit halves of
               s16, yielding a full byte reversal. */
            IRTemp m8  = newTemp(Ity_I64);
            IRTemp s8  = newTemp(Ity_I64);
            IRTemp m16 = newTemp(Ity_I64);
            IRTemp s16 = newTemp(Ity_I64);
            IRTemp m32 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I64);
            t2 = newTemp(Ity_I64);
            assign( t1, getIRegRexB(8, pfx, opc-0xC8) );

            assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
            assign( s8,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
                                mkU8(8)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
                                mkexpr(m8))
                         )
                  );

            assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
            assign( s16,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
                                mkU8(16)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
                                mkexpr(m16))
                         )
                  );

            assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
            assign( t2,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
                                mkU8(32)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
                                mkexpr(m32))
                         )
                  );

            putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
            break;
         } else {
            goto decode_failure;
         }
   17878 
   17879       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   17880 
      /* All of these are possible at sizes 2, 4 and 8, and all three
         sizes are accepted below. */

      case 0xA3: /* BT Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
         break;
      case 0xB3: /* BTR Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
         break;
      case 0xAB: /* BTS Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
         break;
      case 0xBB: /* BTC Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
         break;

      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */

      /* The condition code is (opc - 0x40); the shared handler below
         passes it straight through to dis_cmov_E_G. */
      case 0x40: /* CMOVOb (cmov overflow) */
      case 0x41: /* CMOVNOb (cmov no overflow) */
      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
      case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVNSb (cmov not negative) */
      case 0x4A: /* CMOVP (cmov parity even) */
      case 0x4B: /* CMOVNP (cmov parity odd) */
      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
         break;
   17926 
   17927       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   17928 
      case 0xB0: { /* CMPXCHG Gb,Eb */
         /* Compare-and-exchange, byte variant; dis_cmpxchg_G_E does
            the work and reports decode failure through &ok. */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
         if (!ok) goto decode_failure;
         break;
      }
      case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
         /* As 0xB0, but at the prevailing operand size (sz). */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
         if (!ok) goto decode_failure;
         break;
      }
   17944 
      case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
         /* sz==4 selects cmpxchg8b (two 32-bit halves), sz==8
            selects cmpxchg16b (two 64-bit halves); elemTy and the
            op selectors below follow that choice. */
         IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
         IRTemp  expdHi     = newTemp(elemTy);
         IRTemp  expdLo     = newTemp(elemTy);
         IRTemp  dataHi     = newTemp(elemTy);
         IRTemp  dataLo     = newTemp(elemTy);
         IRTemp  oldHi      = newTemp(elemTy);
         IRTemp  oldLo      = newTemp(elemTy);
         IRTemp  flags_old  = newTemp(Ity_I64);
         IRTemp  flags_new  = newTemp(Ity_I64);
         IRTemp  success    = newTemp(Ity_I1);
         IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
         IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
         IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
         IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
         IRTemp expdHi64    = newTemp(Ity_I64);
         IRTemp expdLo64    = newTemp(Ity_I64);

         /* Translate this using a DCAS, even if there is no LOCK
            prefix.  Life is too short to bother with generating two
            different translations for the with/without-LOCK-prefix
            cases. */
         *expect_CAS = True;

         /* Decode, and generate address. */
         if (have66orF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8) goto decode_failure;
         if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
            goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         if (gregLO3ofRM(modrm) != 1) goto decode_failure;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* cmpxchg16b requires an alignment check. */
         if (sz == 8)
            gen_SEGV_if_not_16_aligned( addr );

         /* Get the expected and new values. */
         assign( expdHi64, getIReg64(R_RDX) );
         assign( expdLo64, getIReg64(R_RAX) );

         /* These are the correctly-sized expected and new values.
            However, we also get expdHi64/expdLo64 above as 64-bits
            regardless, because we will need them later in the 32-bit
            case (paradoxically). */
         assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
                               : mkexpr(expdHi64) );
         assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
                               : mkexpr(expdLo64) );
         assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
         assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );

         /* Do the DCAS */
         stmt( IRStmt_CAS(
                  mkIRCAS( oldHi, oldLo,
                           Iend_LE, mkexpr(addr),
                           mkexpr(expdHi), mkexpr(expdLo),
                           mkexpr(dataHi), mkexpr(dataLo)
               )));

         /* success when oldHi:oldLo == expdHi:expdLo */
         assign( success,
                 binop(opCasCmpEQ,
                       binop(opOR,
                             binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
                             binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
                       ),
                       zero
                 ));

         /* If the DCAS is successful, that is to say oldHi:oldLo ==
            expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
            which is where they came from originally.  Both the actual
            contents of these two regs, and any shadow values, are
            unchanged.  If the DCAS fails then we're putting into
            RDX:RAX the value seen in memory. */
         /* Now of course there's a complication in the 32-bit case
            (bah!): if the DCAS succeeds, we need to leave RDX:RAX
            unchanged; but if we use the same scheme as in the 64-bit
            case, we get hit by the standard rule that a write to the
            bottom 32 bits of an integer register zeros the upper 32
            bits.  And so the upper halves of RDX and RAX mysteriously
            become zero.  So we have to stuff back in the original
            64-bit values which we previously stashed in
            expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
         /* It's just _so_ much fun ... */
         putIRegRDX( 8,
                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
                                           : mkexpr(oldHi),
                                   mkexpr(expdHi64)
                   ));
         putIRegRAX( 8,
                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
                                           : mkexpr(oldLo),
                                   mkexpr(expdLo64)
                   ));

         /* Copy the success bit into the Z flag and leave the others
            unchanged */
         assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
         assign(
            flags_new,
            binop(Iop_Or64,
                  binop(Iop_And64, mkexpr(flags_old),
                                   mkU64(~AMD64G_CC_MASK_Z)),
                  binop(Iop_Shl64,
                        binop(Iop_And64,
                              unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
                        mkU8(AMD64G_CC_SHIFT_Z)) ));

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         /* Set NDEP even though it isn't used.  This makes
            redundant-PUT elimination of previous stores to this field
            work better. */
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

         /* Sheesh.  Aren't you glad it was me and not you that had to
            write and validate all this grunge? */

         /* NOTE(review): the disassembly text says "cmpxchg8b" even
            when sz==8 (ie cmpxchg16b).  This affects only the DIP
            debug output, not semantics -- confirm whether it should
            print "cmpxchg16b" for the 16-byte form. */
         DIP("cmpxchg8b %s\n", dis_buf);
         break;

      }
   18074 
   18075       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   18076 
      case 0xA2: { /* CPUID */
         /* Uses dirty helper:
               void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
            declared to mod rax, wr rbx, rcx, rdx
         */
         IRDirty* d     = NULL;
         HChar*   fName = NULL;
         void*    fAddr = NULL;
         if (haveF2orF3(pfx)) goto decode_failure;
         /* Choose the CPUID flavour matching what the hwcaps claim:
            exactly SSE3+CX16 gets the sse3_and_cx16 (Core-2-like)
            helper; anything else falls back to a baseline (SSE2-only,
            no-CX16) identity. */
         if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
                                  |VEX_HWCAPS_AMD64_CX16)) {
            fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16";
            fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16;
            /* This is a Core-2-like machine */
            //fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
            //fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
            /* This is a Core-i5-like machine */
         }
         else {
            /* Give a CPUID for at least a baseline machine, SSE2
               only, and no CX16 */
            fName = "amd64g_dirtyhelper_CPUID_baseline";
            fAddr = &amd64g_dirtyhelper_CPUID_baseline;
         }

         vassert(fName); vassert(fAddr);
         d = unsafeIRDirty_0_N ( 0/*regparms*/,
                                 fName, fAddr, mkIRExprVec_0() );
         /* declare guest state effects: RAX and RCX are both read and
            written (Ifx_Modify), RBX and RDX are write-only. */
         d->needsBBP = True;   /* helper needs the guest state pointer */
         d->nFxState = 4;
         d->fxState[0].fx     = Ifx_Modify;
         d->fxState[0].offset = OFFB_RAX;
         d->fxState[0].size   = 8;
         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_RBX;
         d->fxState[1].size   = 8;
         d->fxState[2].fx     = Ifx_Modify;
         d->fxState[2].offset = OFFB_RCX;
         d->fxState[2].size   = 8;
         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_RDX;
         d->fxState[3].size   = 8;
         /* execute the dirty call, side-effecting guest state */
         stmt( IRStmt_Dirty(d) );
         /* CPUID is a serialising insn.  So, just in case someone is
            using it as a memory fence ... */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("cpuid\n");
         break;
      }
   18128 
   18129       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   18130 
      case 0xB6: /* MOVZXb Eb,Gv */
         /* Zero-extend a byte into a 16/32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False /*no sign-ext*/ );
         break;
      case 0xB7: /* MOVZXw Ew,Gv */
         /* Zero-extend a word into a 32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False /*no sign-ext*/ );
         break;

      case 0xBE: /* MOVSXb Eb,Gv */
         /* Sign-extend a byte into a 16/32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True /*sign-ext*/ );
         break;
      case 0xBF: /* MOVSXw Ew,Gv */
         /* Sign-extend a word into a 32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True /*sign-ext*/ );
         break;
   18156 
   18157 //.. //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   18158 //.. //--
   18159 //.. //--       case 0xC3: /* MOVNTI Gv,Ev */
   18160 //.. //--          vg_assert(sz == 4);
   18161 //.. //--          modrm = getUChar(eip);
   18162 //.. //--          vg_assert(!epartIsReg(modrm));
   18163 //.. //--          t1 = newTemp(cb);
   18164 //.. //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   18165 //.. //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   18166 //.. //--          t2 = LOW24(pair);
   18167 //.. //--          eip += HI8(pair);
   18168 //.. //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   18169 //.. //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   18170 //.. //--          break;
   18171 
   18172       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   18173 
      case 0xAF: /* IMUL Ev, Gv */
         /* Two-operand IMUL: Gv = Gv * Ev, at the prevailing size. */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_mul_E_G ( vbi, pfx, sz, delta );
         break;

      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */

      case 0x1F:
         /* Multi-byte NOP (0F 1F /mem).  The amode is decoded purely
            to establish the instruction's length; no IR is emitted. */
         if (haveF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("nop%c %s\n", nameISize(sz), dis_buf);
         break;
   18189 
   18190       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
      /* Condition code is (opc - 0x80); the three arms below are the
         branch-chasing speculation machinery. */
      case 0x80: /* JOb (jump overflow) */
      case 0x81: /* JNOb (jump no overflow) */
      case 0x82: /* JBb/JNAEb (jump below) */
      case 0x83: /* JNBb/JAEb (jump not below) */
      case 0x84: /* JZb/JEb (jump zero) */
      case 0x85: /* JNZb/JNEb (jump not zero) */
      case 0x86: /* JBEb/JNAb (jump below or equal) */
      case 0x87: /* JNBEb/JAb (jump not below or equal) */
      case 0x88: /* JSb (jump negative) */
      case 0x89: /* JNSb (jump not negative) */
      case 0x8A: /* JP (jump parity even) */
      case 0x8B: /* JNP/JPO (jump parity odd) */
      case 0x8C: /* JLb/JNGEb (jump less) */
      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
      case 0x8E: /* JLEb/JNGb (jump less or equal) */
      case 0x8F: /* JGb/JNLEb (jump greater) */
       { Long   jmpDelta;
         HChar* comment  = "";
         if (haveF2orF3(pfx)) goto decode_failure;
         jmpDelta = getSDisp32(delta);
         /* Target = end of this insn (delta+4) plus the signed
            32-bit displacement. */
         d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
         delta += 4;
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta < 0
             && resteerOkFn( callback_opaque, d64) ) {
            /* Speculation: assume this backward branch is taken.  So
               we need to emit a side-exit to the insn following this
               one, on the negation of the condition, and continue at
               the branch target address (d64).  If we wind up back at
               the first instruction of the trace, just stop; it's
               better to let the IR loop unroller handle that case. */
            /* (1 ^ cond) flips the bottom bit, giving the negated
               condition code. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition(
                        (AMD64Condcode)(1 ^ (opc - 0x80))),
                     Ijk_Boring,
                     IRConst_U64(guest_RIP_bbstart+delta) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = d64;
            comment = "(assumed taken)";
         }
         else
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta >= 0
             && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
            /* Speculation: assume this forward branch is not taken.
               So we need to emit a side-exit to d64 (the dest) and
               continue disassembling at the insn immediately
               following this one. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition((AMD64Condcode)
                                                   (opc - 0x80)),
                     Ijk_Boring,
                     IRConst_U64(d64) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = guest_RIP_bbstart+delta;
            comment = "(assumed not taken)";
         }
         else {
            /* Conservative default translation - end the block at
               this point. */
            jcc_01( (AMD64Condcode)(opc - 0x80),
                    guest_RIP_bbstart+delta,
                    d64 );
            dres.whatNext = Dis_StopHere;
         }
         DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
         break;
       }
   18263 
      /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
      case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
                 /* 0F 0D /1 -- prefetchw mem8 */
         /* Prefetch hints: the amode is decoded only to establish the
            instruction's length and for disassembly text; no IR is
            emitted. */
         if (have66orF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
            goto decode_failure;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         switch (gregLO3ofRM(modrm)) {
            case 0: DIP("prefetch %s\n", dis_buf); break;
            case 1: DIP("prefetchw %s\n", dis_buf); break;
            default: vassert(0); /*NOTREACHED*/
         }
         break;

      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
      case 0x31: { /* RDTSC */
         /* Read the time-stamp counter via a dirty helper returning a
            64-bit value, then split it into EDX:EAX. */
         IRTemp   val  = newTemp(Ity_I64);
         IRExpr** args = mkIRExprVec_0();
         IRDirty* d    = unsafeIRDirty_1_N (
                            val,
                            0/*regparms*/,
                            "amd64g_dirtyhelper_RDTSC",
                            &amd64g_dirtyhelper_RDTSC,
                            args
                         );
         if (have66orF2orF3(pfx)) goto decode_failure;
         /* execute the dirty call, dumping the result in val. */
         stmt( IRStmt_Dirty(d) );
         putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
         putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
         DIP("rdtsc\n");
         break;
      }
   18302 
   18303 //..       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   18304 //..
   18305 //..       case 0xA1: /* POP %FS */
   18306 //..          dis_pop_segreg( R_FS, sz ); break;
   18307 //..       case 0xA9: /* POP %GS */
   18308 //..          dis_pop_segreg( R_GS, sz ); break;
   18309 //..
   18310 //..       case 0xA0: /* PUSH %FS */
   18311 //..          dis_push_segreg( R_FS, sz ); break;
   18312 //..       case 0xA8: /* PUSH %GS */
   18313 //..          dis_push_segreg( R_GS, sz ); break;
   18314 
   18315       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
      /* Condition code is (opc - 0x90). */
      case 0x90: /* set-Ob (set if overflow) */
      case 0x91: /* set-NOb (set if no overflow) */
      case 0x92: /* set-Bb/set-NAEb (set if below) */
      case 0x93: /* set-NBb/set-AEb (set if not below) */
      case 0x94: /* set-Zb/set-Eb (set if zero) */
      case 0x95: /* set-NZb/set-NEb (set if not zero) */
      case 0x96: /* set-BEb/set-NAb (set if below or equal) */
      case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
      case 0x98: /* set-Sb (set if negative) */
      case 0x99: /* set-NSb (set if not negative) */
      case 0x9A: /* set-P (set if parity even) */
      case 0x9B: /* set-NP (set if parity odd) */
      case 0x9C: /* set-Lb/set-NGEb (set if less) */
      case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
      case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
      case 0x9F: /* set-Gb/set-NLEb (set if greater) */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* t1 := 1 if the condition holds, else 0, as a byte. */
         t1 = newTemp(Ity_I8);
         assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putIRegE(1, pfx, modrm, mkexpr(t1));
            DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
                              nameIRegE(1,pfx,modrm));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), mkexpr(t1) );
            DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
         }
         break;
   18348 
   18349       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   18350 
      /* NOTE(review): unlike neighbouring cases, the four SHLD/SHRD
         cases below do not reject F2/F3 prefixes via haveF2orF3 --
         confirm whether that is intentional. */
      case 0xA4: /* SHLDv imm8,Gv,Ev */
         modrm = getUChar(delta);
         /* d64 is the offset of the imm8 shift count, which sits just
            after the amode. */
         d64   = delta + lengthAMode(pfx, delta);
         vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    mkU8(getUChar(d64)), True, /* literal */
                    dis_buf, True /* left */ );
         break;
      case 0xA5: /* SHLDv %cl,Gv,Ev */
         modrm = getUChar(delta);
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    getIRegCL(), False, /* not literal */
                    "%cl", True /* left */ );
         break;

      case 0xAC: /* SHRDv imm8,Gv,Ev */
         modrm = getUChar(delta);
         d64   = delta + lengthAMode(pfx, delta);
         vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    mkU8(getUChar(d64)), True, /* literal */
                    dis_buf, False /* right */ );
         break;
      case 0xAD: /* SHRDv %cl,Gv,Ev */
         modrm = getUChar(delta);
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    getIRegCL(), False, /* not literal */
                    "%cl", False /* right */);
         break;

      /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
      case 0x05: /* SYSCALL */
         guest_RIP_next_mustcheck = True;
         guest_RIP_next_assumed = guest_RIP_bbstart + delta;
         /* SYSCALL leaves the return address in %rcx. */
         putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
         /* It's important that all guest state is up-to-date
            at this point.  So we declare an end-of-block here, which
            forces any cached guest state to be flushed. */
         jmp_lit(Ijk_Sys_syscall, guest_RIP_next_assumed);
         dres.whatNext = Dis_StopHere;
         DIP("syscall\n");
         break;
   18397 
   18398       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   18399 
   18400       case 0xC0: { /* XADD Gb,Eb */
   18401          Bool decode_OK = False;
   18402          delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   18403          if (!decode_OK)
   18404             goto decode_failure;
   18405          break;
   18406       }
   18407       case 0xC1: { /* XADD Gv,Ev */
   18408          Bool decode_OK = False;
   18409          delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   18410          if (!decode_OK)
   18411             goto decode_failure;
   18412          break;
   18413       }
   18414 
   18415       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   18416 
   18417       case 0x71:
   18418       case 0x72:
   18419       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   18420 
   18421       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   18422       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   18423       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   18424       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   18425 
   18426       case 0xFC:
   18427       case 0xFD:
   18428       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   18429 
   18430       case 0xEC:
   18431       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18432 
   18433       case 0xDC:
   18434       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18435 
   18436       case 0xF8:
   18437       case 0xF9:
   18438       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   18439 
   18440       case 0xE8:
   18441       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18442 
   18443       case 0xD8:
   18444       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18445 
   18446       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   18447       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   18448 
   18449       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   18450 
   18451       case 0x74:
   18452       case 0x75:
   18453       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   18454 
   18455       case 0x64:
   18456       case 0x65:
   18457       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   18458 
   18459       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   18460       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   18461       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   18462 
   18463       case 0x68:
   18464       case 0x69:
   18465       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   18466 
   18467       case 0x60:
   18468       case 0x61:
   18469       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18470 
   18471       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   18472       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   18473       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   18474       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   18475 
   18476       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18477       case 0xF2:
   18478       case 0xF3:
   18479 
   18480       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18481       case 0xD2:
   18482       case 0xD3:
   18483 
   18484       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   18485       case 0xE2:
   18486       {
   18487          Long delta0    = delta-1;
   18488          Bool decode_OK = False;
   18489 
   18490          /* If sz==2 this is SSE, and we assume sse idec has
   18491             already spotted those cases by now. */
   18492          if (sz != 4 && sz != 8)
   18493             goto decode_failure;
   18494          if (have66orF2orF3(pfx))
   18495             goto decode_failure;
   18496 
   18497          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, delta-1 );
   18498          if (!decode_OK) {
   18499             delta = delta0;
   18500             goto decode_failure;
   18501          }
   18502          break;
   18503       }
   18504 
   18505       case 0x0E: /* FEMMS */
   18506       case 0x77: /* EMMS */
   18507          if (sz != 4)
   18508             goto decode_failure;
   18509          do_EMMS_preamble();
   18510          DIP("{f}emms\n");
   18511          break;
   18512 
   18513       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   18514       case 0x01: /* 0F 01 /0 -- SGDT */
   18515                  /* 0F 01 /1 -- SIDT */
   18516       {
   18517           /* This is really revolting, but ... since each processor
   18518              (core) only has one IDT and one GDT, just let the guest
   18519              see it (pass-through semantics).  I can't see any way to
   18520              construct a faked-up value, so don't bother to try. */
   18521          modrm = getUChar(delta);
   18522          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18523          delta += alen;
   18524          if (epartIsReg(modrm)) goto decode_failure;
   18525          if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   18526             goto decode_failure;
   18527          switch (gregLO3ofRM(modrm)) {
   18528             case 0: DIP("sgdt %s\n", dis_buf); break;
   18529             case 1: DIP("sidt %s\n", dis_buf); break;
   18530             default: vassert(0); /*NOTREACHED*/
   18531          }
   18532 
   18533          IRDirty* d = unsafeIRDirty_0_N (
   18534                           0/*regparms*/,
   18535                           "amd64g_dirtyhelper_SxDT",
   18536                           &amd64g_dirtyhelper_SxDT,
   18537                           mkIRExprVec_2( mkexpr(addr),
   18538                                          mkU64(gregLO3ofRM(modrm)) )
   18539                       );
   18540          /* declare we're writing memory */
   18541          d->mFx   = Ifx_Write;
   18542          d->mAddr = mkexpr(addr);
   18543          d->mSize = 6;
   18544          stmt( IRStmt_Dirty(d) );
   18545          break;
   18546       }
   18547 
   18548       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   18549 
   18550       default:
   18551          goto decode_failure;
   18552    } /* switch (opc) for the 2-byte opcodes */
   18553    goto decode_success;
   18554    } /* case 0x0F: of primary opcode */
   18555 
   18556    /* ------------------------ ??? ------------------------ */
   18557 
   18558   default:
   18559   decode_failure:
   18560    /* All decode failures end up here. */
   18561    vex_printf("vex amd64->IR: unhandled instruction bytes: "
   18562               "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   18563               (Int)getUChar(delta_start+0),
   18564               (Int)getUChar(delta_start+1),
   18565               (Int)getUChar(delta_start+2),
   18566               (Int)getUChar(delta_start+3),
   18567               (Int)getUChar(delta_start+4),
   18568               (Int)getUChar(delta_start+5) );
   18569 
   18570    /* Tell the dispatcher that this insn cannot be decoded, and so has
   18571       not been executed, and (is currently) the next to be executed.
   18572       RIP should be up-to-date since it made so at the start of each
   18573       insn, but nevertheless be paranoid and update it again right
   18574       now. */
   18575    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   18576    jmp_lit(Ijk_NoDecode, guest_RIP_curr_instr);
   18577    dres.whatNext = Dis_StopHere;
   18578    dres.len      = 0;
   18579    /* We also need to say that a CAS is not expected now, regardless
   18580       of what it might have been set to at the start of the function,
   18581       since the IR that we've emitted just above (to synthesise a
   18582       SIGILL) does not involve any CAS, and presumably no other IR has
   18583       been emitted for this (non-decoded) insn. */
   18584    *expect_CAS = False;
   18585    return dres;
   18586 
   18587    } /* switch (opc) for the main (primary) opcode switch. */
   18588 
   18589   decode_success:
   18590    /* All decode successes end up here. */
   18591    DIP("\n");
   18592    dres.len = (Int)toUInt(delta - delta_start);
   18593    return dres;
   18594 }
   18595 
   18596 #undef DIP
   18597 #undef DIS
   18598 
   18599 
   18600 /*------------------------------------------------------------*/
   18601 /*--- Top-level fn                                         ---*/
   18602 /*------------------------------------------------------------*/
   18603 
   18604 /* Disassemble a single instruction into IR.  The instruction
   18605    is located in host memory at &guest_code[delta]. */
   18606 
   18607 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   18608                            Bool         put_IP,
   18609                            Bool         (*resteerOkFn) ( void*, Addr64 ),
   18610                            Bool         resteerCisOk,
   18611                            void*        callback_opaque,
   18612                            UChar*       guest_code_IN,
   18613                            Long         delta,
   18614                            Addr64       guest_IP,
   18615                            VexArch      guest_arch,
   18616                            VexArchInfo* archinfo,
   18617                            VexAbiInfo*  abiinfo,
   18618                            Bool         host_bigendian_IN )
   18619 {
   18620    Int       i, x1, x2;
   18621    Bool      expect_CAS, has_CAS;
   18622    DisResult dres;
   18623 
   18624    /* Set globals (see top of this file) */
   18625    vassert(guest_arch == VexArchAMD64);
   18626    guest_code           = guest_code_IN;
   18627    irsb                 = irsb_IN;
   18628    host_is_bigendian    = host_bigendian_IN;
   18629    guest_RIP_curr_instr = guest_IP;
   18630    guest_RIP_bbstart    = guest_IP - delta;
   18631 
   18632    /* We'll consult these after doing disInstr_AMD64_WRK. */
   18633    guest_RIP_next_assumed   = 0;
   18634    guest_RIP_next_mustcheck = False;
   18635 
   18636    x1 = irsb_IN->stmts_used;
   18637    expect_CAS = False;
   18638    dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
   18639                                resteerCisOk,
   18640                                callback_opaque,
   18641                                delta, archinfo, abiinfo );
   18642    x2 = irsb_IN->stmts_used;
   18643    vassert(x2 >= x1);
   18644 
   18645    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   18646       got it right.  Failure of this assertion is serious and denotes
   18647       a bug in disInstr. */
   18648    if (guest_RIP_next_mustcheck
   18649        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   18650       vex_printf("\n");
   18651       vex_printf("assumed next %%rip = 0x%llx\n",
   18652                  guest_RIP_next_assumed );
   18653       vex_printf(" actual next %%rip = 0x%llx\n",
   18654                  guest_RIP_curr_instr + dres.len );
   18655       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   18656    }
   18657 
   18658    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   18659       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   18660       IRCAS as directed by the returned expect_CAS value. */
   18661    has_CAS = False;
   18662    for (i = x1; i < x2; i++) {
   18663       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   18664          has_CAS = True;
   18665    }
   18666 
   18667    if (expect_CAS != has_CAS) {
   18668       /* inconsistency detected.  re-disassemble the instruction so as
   18669          to generate a useful error message; then assert. */
   18670       vex_traceflags |= VEX_TRACE_FE;
   18671       dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
   18672                                   resteerCisOk,
   18673                                   callback_opaque,
   18674                                   delta, archinfo, abiinfo );
   18675       for (i = x1; i < x2; i++) {
   18676          vex_printf("\t\t");
   18677          ppIRStmt(irsb_IN->stmts[i]);
   18678          vex_printf("\n");
   18679       }
   18680       /* Failure of this assertion is serious and denotes a bug in
   18681          disInstr. */
   18682       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   18683    }
   18684 
   18685    return dres;
   18686 }
   18687 
   18688 
   18689 /*------------------------------------------------------------*/
   18690 /*--- Unused stuff                                         ---*/
   18691 /*------------------------------------------------------------*/
   18692 
   18693 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   18694 // this should ever be needed.
   18695 //
   18696 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   18697 //{
   18698 //   /* Scheme is simple: propagate the most significant 1-bit into all
   18699 //      lower positions in the word.  This gives a word of the form
   18700 //      0---01---1.  Now invert it, giving a word of the form
   18701 //      1---10---0, then do a population-count idiom (to count the 1s,
   18702 //      which is the number of leading zeroes, or the word size if the
    18703 //      original word was 0).
   18704 //   */
   18705 //   Int i;
   18706 //   IRTemp t[7];
   18707 //   for (i = 0; i < 7; i++) {
   18708 //      t[i] = newTemp(ty);
   18709 //   }
   18710 //   if (ty == Ity_I64) {
   18711 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   18712 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   18713 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   18714 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   18715 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   18716 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   18717 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   18718 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   18719 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   18720 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   18721 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   18722 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   18723 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   18724 //      return gen_POPCOUNT(ty, t[6]);
   18725 //   }
   18726 //   if (ty == Ity_I32) {
   18727 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   18728 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   18729 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   18730 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   18731 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   18732 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   18733 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   18734 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   18735 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   18736 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   18737 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   18738 //      return gen_POPCOUNT(ty, t[5]);
   18739 //   }
   18740 //   if (ty == Ity_I16) {
   18741 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   18742 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   18743 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   18744 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   18745 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   18746 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   18747 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   18748 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   18749 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   18750 //      return gen_POPCOUNT(ty, t[4]);
   18751 //   }
   18752 //   vassert(0);
   18753 //}
   18754 
   18755 
   18756 /*--------------------------------------------------------------------*/
   18757 /*--- end                                       guest_amd64_toIR.c ---*/
   18758 /*--------------------------------------------------------------------*/
   18759