      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2011 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     55      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     56      even when it isn't.
     57 
     58    * some of the FCOM cases could do with testing -- not convinced
     59      that the args are the right way round.
     60 
     61    * FSAVE does not re-initialise the FPU; it should do
     62 
     63    * FINIT not only initialises the FPU environment, it also zeroes
     64      all the FP registers.  It should leave the registers unchanged.
     65 
     66     RDTSC returns zero, always.
     67 
     68     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     69     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     70     only way to observe eflags[1], a proper fix would be to make that
     71     bit be set by PUSHF.
     72 
     73     This module uses global variables and so is not MT-safe (if that
     74     should ever become relevant).
     75 */
     76 
     77 /* Notes re address size overrides (0x67).
     78 
     79    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     80    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     81    and System Instructions"), Section 1.2.3 ("Address-Size Override
     82    Prefix"):
     83 
     84    0x67 applies to all explicit memory references, causing the top
     85    32 bits of the effective address to become zero.
     86 
     87    0x67 has no effect on stack references (push/pop); these always
     88    use a 64-bit address.
     89 
     90    0x67 changes the interpretation of instructions which implicitly
     91    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     92    instead.  These are:
     93 
     94       cmp{s,sb,sw,sd,sq}
     95       in{s,sb,sw,sd}
     96       jcxz, jecxz, jrcxz
     97       lod{s,sb,sw,sd,sq}
     98       loop{,e,bz,be,z}
     99       mov{s,sb,sw,sd,sq}
    100       out{s,sb,sw,sd}
    101       rep{,e,ne,nz}
    102       sca{s,sb,sw,sd,sq}
    103       sto{s,sb,sw,sd,sq}
    104       xlat{,b} */
    105 
    106 /* "Special" instructions.
    107 
    108    This instruction decoder can decode three special instructions
    109    which mean nothing natively (are no-ops as far as regs/mem are
    110    concerned) but have meaning for supporting Valgrind.  A special
    111    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    112    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    113    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    114    Following that, one of the following 3 are allowed (standard
    115    interpretation in parentheses):
    116 
    117       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    118       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    119       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    120 
    121    Any other bytes following the 16-byte preamble are illegal and
    122    constitute a failure in instruction decoding.  This all assumes
    123    that the preamble will never occur except in specific code
    124    fragments designed for Valgrind to catch.
    125 
    126    No prefixes may precede a "Special" instruction.
    127 */
    128 
    129 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    130    insns: the side-exit back to the start of the insn is done with
    131    Ijk_Boring.  This is quite wrong, it should be done with
    132    Ijk_NoRedir, since otherwise the side exit, which is intended to
    133    restart the instruction for whatever reason, could go somewhere
    134    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    135    no-redir jumps performance critical, at least for rep-prefixed
    136    instructions, since all iterations thereof would involve such a
    137    jump.  It's not such a big deal with casLE since the side exit is
    138    only taken if the CAS fails, that is, the location is contended,
    139    which is relatively unlikely.
    140 
    141    Note also, the test for CAS success vs failure is done using
    142    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    143    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    144    shouldn't definedness-check these comparisons.  See
    145    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    146    background/rationale.
    147 */
    148 
    149 /* LOCK prefixed instructions.  These are translated using IR-level
    150    CAS statements (IRCAS) and are believed to preserve atomicity, even
    151    from the point of view of some other process racing against a
    152    simulated one (presumably they communicate via a shared memory
    153    segment).
    154 
    155    Handlers which are aware of LOCK prefixes are:
    156       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    157       dis_cmpxchg_G_E  (cmpxchg)
    158       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    159       dis_Grp3         (not, neg)
    160       dis_Grp4         (inc, dec)
    161       dis_Grp5         (inc, dec)
    162       dis_Grp8_Imm     (bts, btc, btr)
    163       dis_bt_G_E       (bts, btc, btr)
    164       dis_xadd_G_E     (xadd)
    165 */
    166 
    167 
    168 #include "libvex_basictypes.h"
    169 #include "libvex_ir.h"
    170 #include "libvex.h"
    171 #include "libvex_guest_amd64.h"
    172 
    173 #include "main_util.h"
    174 #include "main_globals.h"
    175 #include "guest_generic_bb_to_IR.h"
    176 #include "guest_generic_x87.h"
    177 #include "guest_amd64_defs.h"
    178 
    179 
    180 /*------------------------------------------------------------*/
    181 /*--- Globals                                              ---*/
    182 /*------------------------------------------------------------*/
    183 
    184 /* These are set at the start of the translation of an insn, right
    185    down in disInstr_AMD64, so that we don't have to pass them around
    186    endlessly.  They are all constant during the translation of any
    187    given insn. */
    188 
    189 /* These are set at the start of the translation of a BB, so
    190    that we don't have to pass them around endlessly. */
    191 
    192 /* We need to know this to do sub-register accesses correctly. */
    193 static Bool host_is_bigendian;
    194 
    195 /* Pointer to the guest code area (points to start of BB, not to the
    196    insn being processed). */
    197 static UChar* guest_code;
    198 
    199 /* The guest address corresponding to guest_code[0]. */
    200 static Addr64 guest_RIP_bbstart;
    201 
    202 /* The guest address for the instruction currently being
    203    translated. */
    204 static Addr64 guest_RIP_curr_instr;
    205 
    206 /* The IRSB* into which we're generating code. */
    207 static IRSB* irsb;
    208 
    209 /* For ensuring that %rip-relative addressing is done right.  A read
    210    of %rip generates the address of the next instruction.  It may be
    211    that we don't conveniently know that inside disAMode().  For sanity
    212    checking, if the next insn %rip is needed, we make a guess at what
    213    it is, record that guess here, and set the accompanying Bool to
    214    indicate that -- after this insn's decode is finished -- that guess
    215    needs to be checked.  */
    216 
    217 /* At the start of each insn decode, is set to (0, False).
    218    After the decode, if _mustcheck is now True, _assumed is
    219    checked. */
    220 
    221 static Addr64 guest_RIP_next_assumed;
    222 static Bool   guest_RIP_next_mustcheck;
    223 
    224 
    225 /*------------------------------------------------------------*/
    226 /*--- Helpers for constructing IR.                         ---*/
    227 /*------------------------------------------------------------*/
    228 
    229 /* Generate a new temporary of the given type. */
    230 static IRTemp newTemp ( IRType ty )
    231 {
    232    vassert(isPlausibleIRType(ty));
    233    return newIRTemp( irsb->tyenv, ty );
    234 }
    235 
/* Add a statement to the list held by "irsb" (the IRSB currently
   under construction). */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e", appending it to "irsb". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}
    247 
/* Build a unary-operator IR expression: op(a). */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Build a binary-operator IR expression: op(a1, a2). */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Build a ternary-operator IR expression: op(a1, a2, a3). */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression which reads the value of temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    267 
/* Build an 8-bit constant expression; asserts the value fits. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Build a 16-bit constant expression; asserts the value fits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Build a 32-bit constant expression; asserts the value fits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

/* Build a 64-bit constant expression. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

/* Build a constant expression of the given integer type; panics on
   non-integer types. */
static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}
    301 
/* Emit a little-endian store of 'data' at 'addr'.  AMD64 is
   little-endian, so this is the only store form this file uses. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Build a little-endian load expression of type 'ty' from 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    311 
    312 static IROp mkSizedOp ( IRType ty, IROp op8 )
    313 {
    314    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    315            || op8 == Iop_Mul8
    316            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    317            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    318            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    319            || op8 == Iop_CasCmpNE8
    320            || op8 == Iop_Not8 );
    321    switch (ty) {
    322       case Ity_I8:  return 0 +op8;
    323       case Ity_I16: return 1 +op8;
    324       case Ity_I32: return 2 +op8;
    325       case Ity_I64: return 3 +op8;
    326       default: vpanic("mkSizedOp(amd64)");
    327    }
    328 }
    329 
    330 static
    331 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    332 {
    333    if (szSmall == 1 && szBig == 4) {
    334       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    335    }
    336    if (szSmall == 1 && szBig == 2) {
    337       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    338    }
    339    if (szSmall == 2 && szBig == 4) {
    340       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    341    }
    342    if (szSmall == 1 && szBig == 8 && !signd) {
    343       return unop(Iop_8Uto64, src);
    344    }
    345    if (szSmall == 1 && szBig == 8 && signd) {
    346       return unop(Iop_8Sto64, src);
    347    }
    348    if (szSmall == 2 && szBig == 8 && !signd) {
    349       return unop(Iop_16Uto64, src);
    350    }
    351    if (szSmall == 2 && szBig == 8 && signd) {
    352       return unop(Iop_16Sto64, src);
    353    }
    354    vpanic("doScalarWidening(amd64)");
    355 }
    356 
    357 
    358 
    359 /*------------------------------------------------------------*/
    360 /*--- Debugging output                                     ---*/
    361 /*------------------------------------------------------------*/
    362 
/* Bomb out if we can't handle something.  Prints a fixed banner and
   then panics with 'str'; never returns. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    370 
    371 #define DIP(format, args...)           \
    372    if (vex_traceflags & VEX_TRACE_FE)  \
    373       vex_printf(format, ## args)
    374 
    375 #define DIS(buf, format, args...)      \
    376    if (vex_traceflags & VEX_TRACE_FE)  \
    377       vex_sprintf(buf, format, ## args)
    378 
    379 
    380 /*------------------------------------------------------------*/
    381 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    382 /*------------------------------------------------------------*/
    383 
    384 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    385 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    386 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    387 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    388 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    389 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    390 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    391 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    392 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    393 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    394 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    395 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    396 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    397 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    398 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    399 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    400 
    401 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    402 
    403 #define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
    404 #define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)
    405 
    406 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    407 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    408 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    409 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    410 
    411 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    412 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    413 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    414 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    415 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    416 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    417 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    418 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    419 //..
    420 //.. #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
    421 //.. #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
    422 //.. #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
    423 //.. #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
    424 //.. #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
    425 //.. #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
    426 //.. #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
    427 //.. #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
    428 
    429 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    430 #define OFFB_XMM0      offsetof(VexGuestAMD64State,guest_XMM0)
    431 #define OFFB_XMM1      offsetof(VexGuestAMD64State,guest_XMM1)
    432 #define OFFB_XMM2      offsetof(VexGuestAMD64State,guest_XMM2)
    433 #define OFFB_XMM3      offsetof(VexGuestAMD64State,guest_XMM3)
    434 #define OFFB_XMM4      offsetof(VexGuestAMD64State,guest_XMM4)
    435 #define OFFB_XMM5      offsetof(VexGuestAMD64State,guest_XMM5)
    436 #define OFFB_XMM6      offsetof(VexGuestAMD64State,guest_XMM6)
    437 #define OFFB_XMM7      offsetof(VexGuestAMD64State,guest_XMM7)
    438 #define OFFB_XMM8      offsetof(VexGuestAMD64State,guest_XMM8)
    439 #define OFFB_XMM9      offsetof(VexGuestAMD64State,guest_XMM9)
    440 #define OFFB_XMM10     offsetof(VexGuestAMD64State,guest_XMM10)
    441 #define OFFB_XMM11     offsetof(VexGuestAMD64State,guest_XMM11)
    442 #define OFFB_XMM12     offsetof(VexGuestAMD64State,guest_XMM12)
    443 #define OFFB_XMM13     offsetof(VexGuestAMD64State,guest_XMM13)
    444 #define OFFB_XMM14     offsetof(VexGuestAMD64State,guest_XMM14)
    445 #define OFFB_XMM15     offsetof(VexGuestAMD64State,guest_XMM15)
    446 #define OFFB_XMM16     offsetof(VexGuestAMD64State,guest_XMM16)
    447 
    448 #define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
    449 #define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
    450 #define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)
    451 
    452 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    453 
    454 
    455 /*------------------------------------------------------------*/
    456 /*--- Helper bits and pieces for deconstructing the        ---*/
    457 /*--- amd64 insn stream.                                   ---*/
    458 /*------------------------------------------------------------*/
    459 
    460 /* This is the AMD64 register encoding -- integer regs. */
    461 #define R_RAX 0
    462 #define R_RCX 1
    463 #define R_RDX 2
    464 #define R_RBX 3
    465 #define R_RSP 4
    466 #define R_RBP 5
    467 #define R_RSI 6
    468 #define R_RDI 7
    469 #define R_R8  8
    470 #define R_R9  9
    471 #define R_R10 10
    472 #define R_R11 11
    473 #define R_R12 12
    474 #define R_R13 13
    475 #define R_R14 14
    476 #define R_R15 15
    477 
    478 //.. #define R_AL (0+R_EAX)
    479 //.. #define R_AH (4+R_EAX)
    480 
    481 /* This is the Intel register encoding -- segment regs. */
    482 #define R_ES 0
    483 #define R_CS 1
    484 #define R_SS 2
    485 #define R_DS 3
    486 #define R_FS 4
    487 #define R_GS 5
    488 
    489 
    490 /* Various simple conversions */
    491 
    492 static ULong extend_s_8to64 ( UChar x )
    493 {
    494    return (ULong)((((Long)x) << 56) >> 56);
    495 }
    496 
    497 static ULong extend_s_16to64 ( UShort x )
    498 {
    499    return (ULong)((((Long)x) << 48) >> 48);
    500 }
    501 
    502 static ULong extend_s_32to64 ( UInt x )
    503 {
    504    return (ULong)((((Long)x) << 32) >> 32);
    505 }
    506 
/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or memory.  If so, the byte will have the form 11XXXYYY,
   where YYY is the register number: that is, both mod bits (7..6)
   are set. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* Extract the 'g' field (bits 5..3) from a modRM byte.  This only
   produces 3 bits, which is not a complete register number -- the
   fourth bit comes from REX.R.  You should avoid this function if at
   all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field (bits 2..0) of a modRM byte; the fourth bit
   comes from REX.B. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}
    531 
/* Get a 8/16/32-bit unsigned value out of the insn stream. */

/* Fetch the single byte at guest_code[delta]. */
static UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

/* Fetch a little-endian 16-bit unsigned displacement starting at
   guest_code[delta]. */
static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}
    546 
    547 //.. static UInt getUDisp ( Int size, Long delta )
    548 //.. {
    549 //..    switch (size) {
    550 //..       case 4: return getUDisp32(delta);
    551 //..       case 2: return getUDisp16(delta);
    552 //..       case 1: return getUChar(delta);
    553 //..       default: vpanic("getUDisp(x86)");
    554 //..    }
    555 //..    return 0; /*notreached*/
    556 //.. }
    557 
    558 
/* Get a byte value out of the insn stream at guest_code[delta] and
   sign-extend it to 64 bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}
    565 
    566 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    567    bits. */
    568 static Long getSDisp16 ( Long delta )
    569 {
    570    UInt v = guest_code[delta+1]; v <<= 8;
    571    v |= guest_code[delta+0];
    572    return extend_s_16to64( (UShort)v );
    573 }
    574 
    575 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    576    bits. */
    577 static Long getSDisp32 ( Long delta )
    578 {
    579    UInt v = guest_code[delta+3]; v <<= 8;
    580    v |= guest_code[delta+2]; v <<= 8;
    581    v |= guest_code[delta+1]; v <<= 8;
    582    v |= guest_code[delta+0];
    583    return extend_s_32to64( v );
    584 }
    585 
    586 /* Get a 64-bit value out of the insn stream. */
    587 static Long getDisp64 ( Long delta )
    588 {
    589    ULong v = 0;
    590    v |= guest_code[delta+7]; v <<= 8;
    591    v |= guest_code[delta+6]; v <<= 8;
    592    v |= guest_code[delta+5]; v <<= 8;
    593    v |= guest_code[delta+4]; v <<= 8;
    594    v |= guest_code[delta+3]; v <<= 8;
    595    v |= guest_code[delta+2]; v <<= 8;
    596    v |= guest_code[delta+1]; v <<= 8;
    597    v |= guest_code[delta+0];
    598    return v;
    599 }
    600 
    601 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    602    if this is called with size==8.  Should not happen. */
    603 static Long getSDisp ( Int size, Long delta )
    604 {
    605    switch (size) {
    606       case 4: return getSDisp32(delta);
    607       case 2: return getSDisp16(delta);
    608       case 1: return getSDisp8(delta);
    609       default: vpanic("getSDisp(amd64)");
    610   }
    611 }
    612 
    613 static ULong mkSizeMask ( Int sz )
    614 {
    615    switch (sz) {
    616       case 1: return 0x00000000000000FFULL;
    617       case 2: return 0x000000000000FFFFULL;
    618       case 4: return 0x00000000FFFFFFFFULL;
    619       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    620       default: vpanic("mkSzMask(amd64)");
    621    }
    622 }
    623 
    624 static Int imin ( Int a, Int b )
    625 {
    626    return (a < b) ? a : b;
    627 }
    628 
    629 static IRType szToITy ( Int n )
    630 {
    631    switch (n) {
    632       case 1: return Ity_I8;
    633       case 2: return Ity_I16;
    634       case 4: return Ity_I32;
    635       case 8: return Ity_I64;
    636       default: vex_printf("\nszToITy(%d)\n", n);
    637                vpanic("szToITy(amd64)");
    638    }
    639 }
    640 
    641 
    642 /*------------------------------------------------------------*/
    643 /*--- For dealing with prefixes.                           ---*/
    644 /*------------------------------------------------------------*/
    645 
    646 /* The idea is to pass around an int holding a bitmask summarising
    647    info from the prefixes seen on the current instruction, including
    648    info from the REX byte.  This info is used in various places, but
    649    most especially when making sense of register fields in
    650    instructions.
    651 
    652    The top 16 bits of the prefix are 0x3141, just as a hacky way
    653    to ensure it really is a valid prefix.
    654 
    655    Things you can safely assume about a well-formed prefix:
    656    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    657    * if REX is not present then REXW,REXR,REXX,REXB will read
    658      as zero.
    659    * F2 and F3 will not both be 1.
    660 */
    661 
    662 typedef UInt  Prefix;
    663 
    664 #define PFX_ASO   (1<<0)     /* address-size override present (0x67) */
    665 #define PFX_66    (1<<1)     /* operand-size override-to-16 present (0x66) */
    666 #define PFX_REX   (1<<2)     /* REX byte present (0x40 to 0x4F) */
    667 #define PFX_REXW  (1<<3)     /* REX W bit, if REX present, else 0 */
    668 #define PFX_REXR  (1<<4)     /* REX R bit, if REX present, else 0 */
    669 #define PFX_REXX  (1<<5)     /* REX X bit, if REX present, else 0 */
    670 #define PFX_REXB  (1<<6)     /* REX B bit, if REX present, else 0 */
    671 #define PFX_LOCK  (1<<7)     /* bus LOCK prefix present (0xF0) */
    672 #define PFX_F2    (1<<8)     /* REP/REPE/REPZ prefix present (0xF2) */
    673 #define PFX_F3    (1<<9)     /* REPNE/REPNZ prefix present (0xF3) */
    674 #define PFX_CS    (1<<10)    /* CS segment prefix present (0x2E) */
    675 #define PFX_DS    (1<<11)    /* DS segment prefix present (0x3E) */
    676 #define PFX_ES    (1<<12)    /* ES segment prefix present (0x26) */
    677 #define PFX_FS    (1<<13)    /* FS segment prefix present (0x64) */
    678 #define PFX_GS    (1<<14)    /* GS segment prefix present (0x65) */
    679 #define PFX_SS    (1<<15)    /* SS segment prefix present (0x36) */
    680 
    681 #define PFX_EMPTY 0x31410000
    682 
/* A well-formed prefix always carries 0x3141 in its top 16 bits
   (PFX_EMPTY). */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFFFF0000) == PFX_EMPTY);
}

/* True iff a REX byte (0x40..0x4F) was present. */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

/* Extract individual REX bits as 0/1.  These read as zero when no
   REX byte was present. */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
/* Apparently unused.
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
*/
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}

/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
/* True iff the 0xF2 prefix byte was seen. */
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
/* True iff the 0xF3 prefix byte was seen. */
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

/* True iff the 0x66 operand-size-override prefix was seen. */
static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
/* True iff the 0x67 address-size-override prefix was seen. */
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
    724 
/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear (66 is not examined) */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear (66 is not examined) */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 all clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F2 set (F3 is not examined) */
static Bool have66orF2 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F2)) > 0);
}

/* Clear all the segment-override bits in a prefix, leaving the
   other bits untouched. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}
    785 
    786 
    787 /*------------------------------------------------------------*/
    788 /*--- For dealing with integer registers                   ---*/
    789 /*------------------------------------------------------------*/
    790 
    791 /* This is somewhat complex.  The rules are:
    792 
    793    For 64, 32 and 16 bit register references, the e or g fields in the
    794    modrm bytes supply the low 3 bits of the register number.  The
    795    fourth (most-significant) bit of the register number is supplied by
    796    the REX byte, if it is present; else that bit is taken to be zero.
    797 
    798    The REX.R bit supplies the high bit corresponding to the g register
    799    field, and the REX.B bit supplies the high bit corresponding to the
    800    e register field (when the mod part of modrm indicates that modrm's
    801    e component refers to a register and not to memory).
    802 
    803    The REX.X bit supplies a high register bit for certain registers
    804    in SIB address modes, and is generally rarely used.
    805 
    806    For 8 bit register references, the presence of the REX byte itself
    807    has significance.  If there is no REX present, then the 3-bit
    808    number extracted from the modrm e or g field is treated as an index
    809    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    810    old x86 encoding scheme.
    811 
    812    But if there is a REX present, the register reference is
    813    interpreted in the same way as for 64/32/16-bit references: a high
    814    bit is extracted from REX, giving a 4-bit number, and the denoted
    815    register is the lowest 8 bits of the 16 integer registers denoted
    816    by the number.  In particular, values 3 through 7 of this sequence
    817    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    818    %rsp %rbp %rsi %rdi.
    819 
    820    The REX.W bit has no bearing at all on register numbers.  Instead
    821    its presence indicates that the operand size is to be overridden
    822    from its default value (32 bits) to 64 bits instead.  This is in
    823    the same fashion that an 0x66 prefix indicates the operand size is
    824    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    825    0x66 are present there is a conflict, and REX.W takes precedence.
    826 
    827    Rather than try to handle this complexity using a single huge
    828    function, several smaller ones are provided.  The aim is to make it
    829    as difficult as possible to screw up register decoding in a subtle
    830    and hard-to-track-down way.
    831 
    832    Because these routines fish around in the host's memory (that is,
    833    in the guest state area) for sub-parts of guest registers, their
    834    correctness depends on the host's endianness.  So far these
    835    routines only work for little-endian hosts.  Those for which
    836    endianness is important have assertions to ensure sanity.
    837 */
    838 
    839 
    840 /* About the simplest question you can ask: where do the 64-bit
    841    integer registers live (in the guest state) ? */
    842 
/* Map an architectural register number (0 .. 15, as produced by
   modrm/REX decoding) to the guest-state byte offset of the
   corresponding full 64-bit integer register.  Panics on any other
   value. */
static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    865 
    866 
    867 /* Produce the name of an integer register, for printing purposes.
    868    reg is a number in the range 0 .. 15 that has been generated from a
    869    3-bit reg-field number and a REX extension bit.  irregular denotes
    870    the case where sz==1 and no REX byte is present. */
    871 
/* Produce the printable name of an integer register.
   sz        -- access size in bytes: 8, 4, 2 or 1.
   reg       -- register number 0 .. 15 (3-bit field + REX extension).
   irregular -- only meaningful for sz==1: True means "no REX byte was
                present", so encodings 4..7 denote the legacy high-byte
                registers %ah %ch %dh %bh (and reg must be < 8). */
static
HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   /* Legacy (no-REX) byte-register names: 4..7 are the high bytes. */
   static HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      /* 'irregular' only makes sense for byte-sized accesses. */
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
    910 
    911 /* Using the same argument conventions as nameIReg, produce the
    912    guest state offset of an integer register. */
    913 
    914 static
    915 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
    916 {
    917    vassert(reg < 16);
    918    if (sz == 1) {
    919       if (irregular)
    920          vassert(reg < 8);
    921    } else {
    922       vassert(irregular == False);
    923    }
    924 
    925    /* Deal with irregular case -- sz==1 and no REX present */
    926    if (sz == 1 && irregular) {
    927       switch (reg) {
    928          case R_RSP: return 1+ OFFB_RAX;
    929          case R_RBP: return 1+ OFFB_RCX;
    930          case R_RSI: return 1+ OFFB_RDX;
    931          case R_RDI: return 1+ OFFB_RBX;
    932          default:    break; /* use the normal case */
    933       }
    934    }
    935 
    936    /* Normal case */
    937    return integerGuestReg64Offset(reg);
    938 }
    939 
    940 
    941 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    942 
    943 static IRExpr* getIRegCL ( void )
    944 {
    945    vassert(!host_is_bigendian);
    946    return IRExpr_Get( OFFB_RCX, Ity_I8 );
    947 }
    948 
    949 
    950 /* Write to the %AH register. */
    951 
    952 static void putIRegAH ( IRExpr* e )
    953 {
    954    vassert(!host_is_bigendian);
    955    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
    956    stmt( IRStmt_Put( OFFB_RAX+1, e ) );
    957 }
    958 
    959 
    960 /* Read/write various widths of %RAX, as it has various
    961    special-purpose uses. */
    962 
    963 static HChar* nameIRegRAX ( Int sz )
    964 {
    965    switch (sz) {
    966       case 1: return "%al";
    967       case 2: return "%ax";
    968       case 4: return "%eax";
    969       case 8: return "%rax";
    970       default: vpanic("nameIRegRAX(amd64)");
    971    }
    972 }
    973 
/* Read the sz-byte sub-register of %rax.  The 4-byte case is done as
   a 64-bit Get followed by a narrow (rather than a 32-bit Get), so
   the whole register is read. */
static IRExpr* getIRegRAX ( Int sz )
{
   /* Sub-register reads at the base offset are only correct on a
      little-endian host. */
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}
    985 
/* Write the sz-byte sub-register of %rax.  Per amd64 semantics the
   4-byte case zero-extends to the full 64 bits; the 2- and 1-byte
   cases are partial Puts which leave the upper bytes unchanged. */
static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   /* Partial Puts at the base offset are only correct on a
      little-endian host. */
   vassert(!host_is_bigendian);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              /* 32-bit writes clear the upper half of the register. */
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}
   1006 
   1007 
   1008 /* Read/write various widths of %RDX, as it has various
   1009    special-purpose uses. */
   1010 
   1011 static HChar* nameIRegRDX ( Int sz )
   1012 {
   1013    switch (sz) {
   1014       case 1: return "%dl";
   1015       case 2: return "%dx";
   1016       case 4: return "%edx";
   1017       case 8: return "%rdx";
   1018       default: vpanic("nameIRegRDX(amd64)");
   1019    }
   1020 }
   1021 
/* Read the sz-byte sub-register of %rdx.  As with getIRegRAX, the
   4-byte case reads the whole 64-bit register and narrows. */
static IRExpr* getIRegRDX ( Int sz )
{
   /* Sub-register reads at the base offset are only correct on a
      little-endian host. */
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}
   1033 
/* Write the sz-byte sub-register of %rdx.  The 4-byte case
   zero-extends to 64 bits (amd64 semantics); the 2- and 1-byte cases
   are partial Puts which leave the upper bytes unchanged. */
static void putIRegRDX ( Int sz, IRExpr* e )
{
   /* Partial Puts at the base offset are only correct on a
      little-endian host. */
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: /* 32-bit writes clear the upper half of the register. */
              stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}
   1050 
   1051 
   1052 /* Simplistic functions to deal with the integer registers as a
   1053    straightforward bank of 16 64-bit regs. */
   1054 
   1055 static IRExpr* getIReg64 ( UInt regno )
   1056 {
   1057    return IRExpr_Get( integerGuestReg64Offset(regno),
   1058                       Ity_I64 );
   1059 }
   1060 
   1061 static void putIReg64 ( UInt regno, IRExpr* e )
   1062 {
   1063    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1064    stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
   1065 }
   1066 
   1067 static HChar* nameIReg64 ( UInt regno )
   1068 {
   1069    return nameIReg( 8, regno, False );
   1070 }
   1071 
   1072 
   1073 /* Simplistic functions to deal with the lower halves of integer
   1074    registers as a straightforward bank of 16 32-bit regs. */
   1075 
   1076 static IRExpr* getIReg32 ( UInt regno )
   1077 {
   1078    vassert(!host_is_bigendian);
   1079    return unop(Iop_64to32,
   1080                IRExpr_Get( integerGuestReg64Offset(regno),
   1081                            Ity_I64 ));
   1082 }
   1083 
   1084 static void putIReg32 ( UInt regno, IRExpr* e )
   1085 {
   1086    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1087    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1088                      unop(Iop_32Uto64,e) ) );
   1089 }
   1090 
   1091 static HChar* nameIReg32 ( UInt regno )
   1092 {
   1093    return nameIReg( 4, regno, False );
   1094 }
   1095 
   1096 
   1097 /* Simplistic functions to deal with the lower quarters of integer
   1098    registers as a straightforward bank of 16 16-bit regs. */
   1099 
   1100 static IRExpr* getIReg16 ( UInt regno )
   1101 {
   1102    vassert(!host_is_bigendian);
   1103    return IRExpr_Get( integerGuestReg64Offset(regno),
   1104                       Ity_I16 );
   1105 }
   1106 
   1107 static void putIReg16 ( UInt regno, IRExpr* e )
   1108 {
   1109    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1110    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1111                      unop(Iop_16Uto64,e) ) );
   1112 }
   1113 
   1114 static HChar* nameIReg16 ( UInt regno )
   1115 {
   1116    return nameIReg( 2, regno, False );
   1117 }
   1118 
   1119 
   1120 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1121    which field of the REX byte is to be used to extend to a 4-bit
   1122    number.  These functions cater for that situation.
   1123 */
   1124 static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
   1125 {
   1126    vassert(lo3bits < 8);
   1127    vassert(IS_VALID_PFX(pfx));
   1128    return getIReg64( lo3bits | (getRexX(pfx) << 3) );
   1129 }
   1130 
   1131 static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
   1132 {
   1133    vassert(lo3bits < 8);
   1134    vassert(IS_VALID_PFX(pfx));
   1135    return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
   1136 }
   1137 
   1138 static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1139 {
   1140    vassert(lo3bits < 8);
   1141    vassert(IS_VALID_PFX(pfx));
   1142    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1143    return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1144                         toBool(sz==1 && !haveREX(pfx)) );
   1145 }
   1146 
/* Read the sz-byte register denoted by a 3-bit field extended with
   the REX.B bit.  The 4-byte case reads the full 64-bit register and
   narrows, like getIRegRAX. */
static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   if (sz == 4) {
      /* Note: sz is rebound to 8 here, so the Get below is a full
         64-bit read; it also makes the 'irregular' expression below
         trivially False (irregular only applies when sz==1). */
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get(
                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                     toBool(sz==1 && !haveREX(pfx)) ),
                     szToITy(sz)
                 )
             );
   } else {
      return IRExpr_Get(
                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                toBool(sz==1 && !haveREX(pfx)) ),
                szToITy(sz)
             );
   }
}
   1169 
   1170 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1171 {
   1172    vassert(lo3bits < 8);
   1173    vassert(IS_VALID_PFX(pfx));
   1174    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1175    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1176    stmt( IRStmt_Put(
   1177             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1178                             toBool(sz==1 && !haveREX(pfx)) ),
   1179             sz==4 ? unop(Iop_32Uto64,e) : e
   1180    ));
   1181 }
   1182 
   1183 
   1184 /* Functions for getting register numbers from modrm bytes and REX
   1185    when we don't have to consider the complexities of integer subreg
   1186    accesses.
   1187 */
   1188 /* Extract the g reg field from a modRM byte, and augment it using the
   1189    REX.R bit from the supplied REX byte.  The R bit usually is
   1190    associated with the g register field.
   1191 */
   1192 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1193 {
   1194    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1195    reg += (pfx & PFX_REXR) ? 8 : 0;
   1196    return reg;
   1197 }
   1198 
   1199 /* Extract the e reg field from a modRM byte, and augment it using the
   1200    REX.B bit from the supplied REX byte.  The B bit usually is
   1201    associated with the e register field (when modrm indicates e is a
   1202    register, that is).
   1203 */
   1204 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1205 {
   1206    Int rm;
   1207    vassert(epartIsReg(mod_reg_rm));
   1208    rm = (Int)(mod_reg_rm & 0x7);
   1209    rm += (pfx & PFX_REXB) ? 8 : 0;
   1210    return rm;
   1211 }
   1212 
   1213 
   1214 /* General functions for dealing with integer register access. */
   1215 
   1216 /* Produce the guest state offset for a reference to the 'g' register
   1217    field in a modrm byte, taking into account REX (or its absence),
   1218    and the size of the access.
   1219 */
   1220 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1221 {
   1222    UInt reg;
   1223    vassert(!host_is_bigendian);
   1224    vassert(IS_VALID_PFX(pfx));
   1225    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1226    reg = gregOfRexRM( pfx, mod_reg_rm );
   1227    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1228 }
   1229 
/* Read the 'g' register of a modrm byte at the given size.  The
   4-byte case reads the full 64-bit register and narrows. */
static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      /* sz is rebound to 8 so the Get reads the whole register. */
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}
   1243 
   1244 static
   1245 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1246 {
   1247    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1248    if (sz == 4) {
   1249       e = unop(Iop_32Uto64,e);
   1250    }
   1251    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1252 }
   1253 
   1254 static
   1255 HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1256 {
   1257    return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
   1258                         toBool(sz==1 && !haveREX(pfx)) );
   1259 }
   1260 
   1261 
   1262 /* Produce the guest state offset for a reference to the 'e' register
   1263    field in a modrm byte, taking into account REX (or its absence),
   1264    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1265    denotes a memory access rather than a register access.
   1266 */
   1267 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1268 {
   1269    UInt reg;
   1270    vassert(!host_is_bigendian);
   1271    vassert(IS_VALID_PFX(pfx));
   1272    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1273    reg = eregOfRexRM( pfx, mod_reg_rm );
   1274    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1275 }
   1276 
/* Read the 'e' register of a modrm byte at the given size.  The
   4-byte case reads the full 64-bit register and narrows. */
static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      /* sz is rebound to 8 so the Get reads the whole register. */
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}
   1290 
   1291 static
   1292 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1293 {
   1294    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1295    if (sz == 4) {
   1296       e = unop(Iop_32Uto64,e);
   1297    }
   1298    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1299 }
   1300 
   1301 static
   1302 HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1303 {
   1304    return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
   1305                         toBool(sz==1 && !haveREX(pfx)) );
   1306 }
   1307 
   1308 
   1309 /*------------------------------------------------------------*/
   1310 /*--- For dealing with XMM registers                       ---*/
   1311 /*------------------------------------------------------------*/
   1312 
   1313 //.. static Int segmentGuestRegOffset ( UInt sreg )
   1314 //.. {
   1315 //..    switch (sreg) {
   1316 //..       case R_ES: return OFFB_ES;
   1317 //..       case R_CS: return OFFB_CS;
   1318 //..       case R_SS: return OFFB_SS;
   1319 //..       case R_DS: return OFFB_DS;
   1320 //..       case R_FS: return OFFB_FS;
   1321 //..       case R_GS: return OFFB_GS;
   1322 //..       default: vpanic("segmentGuestRegOffset(x86)");
   1323 //..    }
   1324 //.. }
   1325 
/* Map an XMM register number (0 .. 15) to its guest-state byte
   offset.  Panics on any other value. */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   switch (xmmreg) {
      case 0:  return OFFB_XMM0;
      case 1:  return OFFB_XMM1;
      case 2:  return OFFB_XMM2;
      case 3:  return OFFB_XMM3;
      case 4:  return OFFB_XMM4;
      case 5:  return OFFB_XMM5;
      case 6:  return OFFB_XMM6;
      case 7:  return OFFB_XMM7;
      case 8:  return OFFB_XMM8;
      case 9:  return OFFB_XMM9;
      case 10: return OFFB_XMM10;
      case 11: return OFFB_XMM11;
      case 12: return OFFB_XMM12;
      case 13: return OFFB_XMM13;
      case 14: return OFFB_XMM14;
      case 15: return OFFB_XMM15;
      default: vpanic("xmmGuestRegOffset(amd64)");
   }
}
   1348 
   1349 /* Lanes of vector registers are always numbered from zero being the
   1350    least significant lane (rightmost in the register).  */
   1351 
   1352 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
   1353 {
   1354    /* Correct for little-endian host only. */
   1355    vassert(!host_is_bigendian);
   1356    vassert(laneno >= 0 && laneno < 8);
   1357    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
   1358 }
   1359 
   1360 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
   1361 {
   1362    /* Correct for little-endian host only. */
   1363    vassert(!host_is_bigendian);
   1364    vassert(laneno >= 0 && laneno < 4);
   1365    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
   1366 }
   1367 
   1368 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
   1369 {
   1370    /* Correct for little-endian host only. */
   1371    vassert(!host_is_bigendian);
   1372    vassert(laneno >= 0 && laneno < 2);
   1373    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
   1374 }
   1375 
   1376 //.. static IRExpr* getSReg ( UInt sreg )
   1377 //.. {
   1378 //..    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
   1379 //.. }
   1380 //..
   1381 //.. static void putSReg ( UInt sreg, IRExpr* e )
   1382 //.. {
   1383 //..    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1384 //..    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
   1385 //.. }
   1386 
   1387 static IRExpr* getXMMReg ( UInt xmmreg )
   1388 {
   1389    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
   1390 }
   1391 
   1392 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
   1393 {
   1394    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
   1395 }
   1396 
   1397 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
   1398 {
   1399    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
   1400 }
   1401 
   1402 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
   1403 {
   1404    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
   1405 }
   1406 
   1407 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
   1408 {
   1409    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
   1410 }
   1411 
   1412 static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
   1413 {
   1414   return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
   1415 }
   1416 
   1417 static void putXMMReg ( UInt xmmreg, IRExpr* e )
   1418 {
   1419    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1420    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
   1421 }
   1422 
   1423 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
   1424 {
   1425    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1426    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1427 }
   1428 
   1429 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
   1430 {
   1431    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1432    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1433 }
   1434 
   1435 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
   1436 {
   1437    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1438    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1439 }
   1440 
   1441 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
   1442 {
   1443    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1444    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1445 }
   1446 
   1447 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
   1448 {
   1449    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1450    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
   1451 }
   1452 
   1453 static IRExpr* mkV128 ( UShort mask )
   1454 {
   1455    return IRExpr_Const(IRConst_V128(mask));
   1456 }
   1457 
   1458 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1459 {
   1460    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1461    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1462    return unop(Iop_64to1,
   1463                binop(Iop_And64,
   1464                      unop(Iop_1Uto64,x),
   1465                      unop(Iop_1Uto64,y)));
   1466 }
   1467 
   1468 /* Generate a compare-and-swap operation, operating on memory at
   1469    'addr'.  The expected value is 'expVal' and the new value is
   1470    'newVal'.  If the operation fails, then transfer control (with a
   1471    no-redir jump (XXX no -- see comment at top of this file)) to
   1472    'restart_point', which is presumably the address of the guest
   1473    instruction again -- retrying, essentially. */
/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, control transfers to
   'restart_point' (presumably the address of the guest instruction
   again) so the instruction is retried. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);   /* receives the value found at addr */
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same (integer) type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   /* Snapshot expVal into a temp so it can be both fed to the CAS
      and compared against the old value afterwards. */
   assign(expTmp, expVal);
   /* Single (non-double) little-endian CAS: hi halves are NULL. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the value found differs from the expected value, the CAS did
      not take effect; branch back to restart_point to retry. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point )
         ));
}
   1496 
   1497 
   1498 /*------------------------------------------------------------*/
   1499 /*--- Helpers for %rflags.                                 ---*/
   1500 /*------------------------------------------------------------*/
   1501 
   1502 /* -------------- Evaluating the flags-thunk. -------------- */
   1503 
   1504 /* Build IR to calculate all the eflags from stored
   1505    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1506    Ity_I64. */
/* Build IR to calculate all the eflags from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP, via a clean helper call.  Returns
   an expression :: Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   /* Argument order fixes the mcx_mask bit positions below:
      0=OP, 1=DEP1, 2=DEP2, 3=NDEP. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP (bit 0) and NDEP (bit 3) from definedness checking.
      We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1526 
   1527 /* Build IR to calculate some particular condition from stored
   1528    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1529    Ity_Bit. */
/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP, via a clean helper call.  Returns
   an expression :: Ity_Bit (I1). */
static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
{
   /* Argument order fixes the mcx_mask bit positions below:
      0=cond, 1=OP, 2=DEP1, 3=DEP2, 4=NDEP. */
   IRExpr** args
      = mkIRExprVec_5( mkU64(cond),
                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_condition", &amd64g_calculate_condition,
           args
        );
   /* Exclude the requested condition (bit 0), OP (bit 1) and NDEP
      (bit 4) from definedness checking.  We're only interested in
      DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   /* The helper returns I64; narrow to the I1 the caller expects. */
   return unop(Iop_64to1, call);
}
   1550 
   1551 /* Build IR to calculate just the carry flag from stored
   1552    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
/* Build IR to calculate just the carry flag from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP, via a clean helper call.  Returns
   an expression :: Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_c ( void )
{
   /* Argument order fixes the mcx_mask bit positions below:
      0=OP, 1=DEP1, 2=DEP2, 3=NDEP. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
           args
        );
   /* Exclude OP (bit 0) and NDEP (bit 3) from definedness checking.
      We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1572 
   1573 
   1574 /* -------------- Building the flags-thunk. -------------- */
   1575 
   1576 /* The machinery in this section builds the flag-thunk following a
   1577    flag-setting operation.  Hence the various setFlags_* functions.
   1578 */
   1579 
   1580 static Bool isAddSub ( IROp op8 )
   1581 {
   1582    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1583 }
   1584 
   1585 static Bool isLogic ( IROp op8 )
   1586 {
   1587    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1588 }
   1589 
   1590 /* U-widen 8/16/32/64 bit int expr to 64. */
   1591 static IRExpr* widenUto64 ( IRExpr* e )
   1592 {
   1593    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1594       case Ity_I64: return e;
   1595       case Ity_I32: return unop(Iop_32Uto64, e);
   1596       case Ity_I16: return unop(Iop_16Uto64, e);
   1597       case Ity_I8:  return unop(Iop_8Uto64, e);
   1598       default: vpanic("widenUto64");
   1599    }
   1600 }
   1601 
    1602 /* S-widen 8/16/32/64 bit int expr to 64. */
   1603 static IRExpr* widenSto64 ( IRExpr* e )
   1604 {
   1605    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1606       case Ity_I64: return e;
   1607       case Ity_I32: return unop(Iop_32Sto64, e);
   1608       case Ity_I16: return unop(Iop_16Sto64, e);
   1609       case Ity_I8:  return unop(Iop_8Sto64, e);
   1610       default: vpanic("widenSto64");
   1611    }
   1612 }
   1613 
   1614 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1615    of these combinations make sense. */
/* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   of these combinations make sense; all others panic (after printing
   the offending pair of types). */
static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
{
   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   /* Identity narrowing is a no-op. */
   if (src_ty == dst_ty)
      return e;
   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
      return unop(Iop_32to16, e);
   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
      return unop(Iop_32to8, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I32)
      return unop(Iop_64to32, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I16)
      return unop(Iop_64to16, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I8)
      return unop(Iop_64to8, e);

   /* Unsupported combination: report the types before panicking. */
   vex_printf("\nsrc, dst tys are: ");
   ppIRType(src_ty);
   vex_printf(", ");
   ppIRType(dst_ty);
   vex_printf("\n");
   vpanic("narrowTo(amd64)");
}
   1639 
   1640 
   1641 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1642    auto-sized up to the real op. */
   1643 
   1644 static
   1645 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1646 {
   1647    Int ccOp = 0;
   1648    switch (ty) {
   1649       case Ity_I8:  ccOp = 0; break;
   1650       case Ity_I16: ccOp = 1; break;
   1651       case Ity_I32: ccOp = 2; break;
   1652       case Ity_I64: ccOp = 3; break;
   1653       default: vassert(0);
   1654    }
   1655    switch (op8) {
   1656       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1657       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1658       default:       ppIROp(op8);
   1659                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1660    }
   1661    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1662    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1663    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1664 }
   1665 
   1666 
   1667 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1668 
   1669 static
   1670 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1671 {
   1672    Int ccOp = 0;
   1673    switch (ty) {
   1674       case Ity_I8:  ccOp = 0; break;
   1675       case Ity_I16: ccOp = 1; break;
   1676       case Ity_I32: ccOp = 2; break;
   1677       case Ity_I64: ccOp = 3; break;
   1678       default: vassert(0);
   1679    }
   1680    switch (op8) {
   1681       case Iop_Or8:
   1682       case Iop_And8:
   1683       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1684       default:       ppIROp(op8);
   1685                      vpanic("setFlags_DEP1(amd64)");
   1686    }
   1687    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1688    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1689    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1690 }
   1691 
   1692 
   1693 /* For shift operations, we put in the result and the undershifted
   1694    result.  Except if the shift amount is zero, the thunk is left
   1695    unchanged. */
   1696 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Width offset added onto the base thunk-op constant: 0=B 1=W 2=L 3=Q. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each field is written through a Mux0X on 'guard': when guard is
      zero the field's previous value is re-stored, so the thunk is
      left unchanged, per the function's contract above (shift by
      zero leaves the flags alone). */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
   1738 
   1739 
   1740 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1741    the former value of the carry flag, which unfortunately we have to
   1742    compute. */
   1743 
static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Base op selects INC vs DEC; the width offset (0=B .. 3=Q) is
      added below. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields.  NDEP then carries
      the pre-instruction carry bit, which inc/dec must preserve. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
   1763 
   1764 
   1765 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1766    two arguments. */
   1767 
   1768 static
   1769 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1770 {
   1771    switch (ty) {
   1772       case Ity_I8:
   1773          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1774          break;
   1775       case Ity_I16:
   1776          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1777          break;
   1778       case Ity_I32:
   1779          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1780          break;
   1781       case Ity_I64:
   1782          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1783          break;
   1784       default:
   1785          vpanic("setFlags_MUL(amd64)");
   1786    }
   1787    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1788    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1789 }
   1790 
   1791 
   1792 /* -------------- Condition codes. -------------- */
   1793 
   1794 /* Condition codes, using the AMD encoding.  */
   1795 
/* Printable mnemonic for a condition code.  Where Intel/AMD define
   synonymous mnemonics (nb/ae, z/e, ...), the preferred form is
   returned and the alternative is noted in a comment. */
static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1819 
   1820 static
   1821 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1822                                           /*OUT*/Bool*   needInvert )
   1823 {
   1824    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1825    if (cond & 1) {
   1826       *needInvert = True;
   1827       return cond-1;
   1828    } else {
   1829       *needInvert = False;
   1830       return cond;
   1831    }
   1832 }
   1833 
   1834 
   1835 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1836 
   1837 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1838    appropriately.
   1839 
   1840    Optionally, generate a store for the 'tres' value.  This can either
   1841    be a normal store, or it can be a cas-with-possible-failure style
   1842    store:
   1843 
   1844    if taddr is IRTemp_INVALID, then no store is generated.
   1845 
   1846    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1847    the address) is generated:
   1848 
   1849      if texpVal is IRTemp_INVALID then a normal store is
   1850      generated, and restart_point must be zero (it is irrelevant).
   1851 
   1852      if texpVal is not IRTemp_INVALID then a cas-style store is
   1853      generated.  texpVal is the expected value, restart_point
   1854      is the restart point if the store fails, and texpVal must
   1855      have the same type as tres.
   1856 
   1857 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, as a 64-bit 0/1 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to 'ty' */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Pick the size-specific ADC thunk operation. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry. */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the thunk: DEP1 = first operand, DEP2 = second operand
      xor'd with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   1911 
   1912 
   1913 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1914    appropriately.  As with helper_ADC, possibly generate a store of
   1915    the result -- see comments on helper_ADC for details.
   1916 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, as a 64-bit 0/1 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to 'ty' */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Pick the size-specific SBB thunk operation. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry. */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the thunk: DEP1 = first operand, DEP2 = second operand
      xor'd with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   1970 
   1971 
   1972 /* -------------- Helpers for disassembly printing. -------------- */
   1973 
   1974 static HChar* nameGrp1 ( Int opc_aux )
   1975 {
   1976    static HChar* grp1_names[8]
   1977      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1978    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   1979    return grp1_names[opc_aux];
   1980 }
   1981 
   1982 static HChar* nameGrp2 ( Int opc_aux )
   1983 {
   1984    static HChar* grp2_names[8]
   1985      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1986    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   1987    return grp2_names[opc_aux];
   1988 }
   1989 
   1990 static HChar* nameGrp4 ( Int opc_aux )
   1991 {
   1992    static HChar* grp4_names[8]
   1993      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1994    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   1995    return grp4_names[opc_aux];
   1996 }
   1997 
   1998 static HChar* nameGrp5 ( Int opc_aux )
   1999 {
   2000    static HChar* grp5_names[8]
   2001      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2002    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2003    return grp5_names[opc_aux];
   2004 }
   2005 
   2006 static HChar* nameGrp8 ( Int opc_aux )
   2007 {
   2008    static HChar* grp8_names[8]
   2009       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2010    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2011    return grp8_names[opc_aux];
   2012 }
   2013 
   2014 //.. static HChar* nameSReg ( UInt sreg )
   2015 //.. {
   2016 //..    switch (sreg) {
   2017 //..       case R_ES: return "%es";
   2018 //..       case R_CS: return "%cs";
   2019 //..       case R_SS: return "%ss";
   2020 //..       case R_DS: return "%ds";
   2021 //..       case R_FS: return "%fs";
   2022 //..       case R_GS: return "%gs";
   2023 //..       default: vpanic("nameSReg(x86)");
   2024 //..    }
   2025 //.. }
   2026 
   2027 static HChar* nameMMXReg ( Int mmxreg )
   2028 {
   2029    static HChar* mmx_names[8]
   2030      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2031    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2032    return mmx_names[mmxreg];
   2033 }
   2034 
   2035 static HChar* nameXMMReg ( Int xmmreg )
   2036 {
   2037    static HChar* xmm_names[16]
   2038      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2039          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2040          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2041          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2042    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2043    return xmm_names[xmmreg];
   2044 }
   2045 
   2046 static HChar* nameMMXGran ( Int gran )
   2047 {
   2048    switch (gran) {
   2049       case 0: return "b";
   2050       case 1: return "w";
   2051       case 2: return "d";
   2052       case 3: return "q";
   2053       default: vpanic("nameMMXGran(amd64,guest)");
   2054    }
   2055 }
   2056 
   2057 static HChar nameISize ( Int size )
   2058 {
   2059    switch (size) {
   2060       case 8: return 'q';
   2061       case 4: return 'l';
   2062       case 2: return 'w';
   2063       case 1: return 'b';
   2064       default: vpanic("nameISize(amd64)");
   2065    }
   2066 }
   2067 
   2068 
   2069 /*------------------------------------------------------------*/
   2070 /*--- JMP helpers                                          ---*/
   2071 /*------------------------------------------------------------*/
   2072 
   2073 static void jmp_lit( IRJumpKind kind, Addr64 d64 )
   2074 {
   2075    irsb->next     = mkU64(d64);
   2076    irsb->jumpkind = kind;
   2077 }
   2078 
   2079 static void jmp_treg( IRJumpKind kind, IRTemp t )
   2080 {
   2081    irsb->next     = mkexpr(t);
   2082    irsb->jumpkind = kind;
   2083 }
   2084 
/* Generate a conditional two-way control-flow split: a side exit
   taken on the given condition, and a fall-through to the other
   target.  The condition is first canonicalised to its positive
   (even) form; if that required inversion, the two targets are
   swapped so the overall behaviour is unchanged. */
static
void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   Bool          invert;
   AMD64Condcode condPos;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* positive condition true <=> original condition false, so the
         side exit goes to d64_false. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false) ) );
      irsb->next     = mkU64(d64_true);
      irsb->jumpkind = Ijk_Boring;
   } else {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true) ) );
      irsb->next     = mkU64(d64_false);
      irsb->jumpkind = Ijk_Boring;
   }
}
   2105 
   2106 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2107    guest address of the next instruction to be executed.
   2108 
   2109    This function generates an AbiHint to say that -128(%rsp)
   2110    .. -1(%rsp) should now be regarded as uninitialised.
   2111 */
static
void make_redzone_AbiHint ( VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, HChar* who )
{
   /* Emit an AbiHint marking [new_rsp - szB, new_rsp) -- the ABI red
      zone -- as undefined after a call/return.  'who' is only used
      for debug printing. */
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only AbI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   if (szB > 0)
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2134 
   2135 
   2136 /*------------------------------------------------------------*/
   2137 /*--- Disassembling addressing modes                       ---*/
   2138 /*------------------------------------------------------------*/
   2139 
   2140 static
   2141 HChar* segRegTxt ( Prefix pfx )
   2142 {
   2143    if (pfx & PFX_CS) return "%cs:";
   2144    if (pfx & PFX_DS) return "%ds:";
   2145    if (pfx & PFX_ES) return "%es:";
   2146    if (pfx & PFX_FS) return "%fs:";
   2147    if (pfx & PFX_GS) return "%gs:";
   2148    if (pfx & PFX_SS) return "%ss:";
   2149    return ""; /* no override */
   2150 }
   2151 
   2152 
   2153 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2154    linear address by adding any required segment override as indicated
   2155    by sorb, and also dealing with any address size override
   2156    present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   /* A 0x67 prefix truncates the address to 32 bits: narrow to I32
      and zero-extend back to I64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2194 
   2195 //.. {
   2196 //..    Int    sreg;
   2197 //..    IRType hWordTy;
   2198 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2199 //..
   2200 //..    if (sorb == 0)
   2201 //..       /* the common case - no override */
   2202 //..       return virtual;
   2203 //..
   2204 //..    switch (sorb) {
   2205 //..       case 0x3E: sreg = R_DS; break;
   2206 //..       case 0x26: sreg = R_ES; break;
   2207 //..       case 0x64: sreg = R_FS; break;
   2208 //..       case 0x65: sreg = R_GS; break;
   2209 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2210 //..    }
   2211 //..
   2212 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2213 //..
   2214 //..    seg_selector = newTemp(Ity_I32);
   2215 //..    ldt_ptr      = newTemp(hWordTy);
   2216 //..    gdt_ptr      = newTemp(hWordTy);
   2217 //..    r64          = newTemp(Ity_I64);
   2218 //..
   2219 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2220 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2221 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2222 //..
   2223 //..    /*
   2224 //..    Call this to do the translation and limit checks:
   2225 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2226 //..                                  UInt seg_selector, UInt virtual_addr )
   2227 //..    */
   2228 //..    assign(
   2229 //..       r64,
   2230 //..       mkIRExprCCall(
   2231 //..          Ity_I64,
   2232 //..          0/*regparms*/,
   2233 //..          "x86g_use_seg_selector",
   2234 //..          &x86g_use_seg_selector,
   2235 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2236 //..                         mkexpr(seg_selector), virtual)
   2237 //..       )
   2238 //..    );
   2239 //..
   2240 //..    /* If the high 32 of the result are non-zero, there was a
   2241 //..       failure in address translation.  In which case, make a
   2242 //..       quick exit.
   2243 //..    */
   2244 //..    stmt(
   2245 //..       IRStmt_Exit(
   2246 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2247 //..          Ijk_MapFail,
   2248 //..          IRConst_U32( guest_eip_curr_instr )
   2249 //..       )
   2250 //..    );
   2251 //..
   2252 //..    /* otherwise, here's the translated result. */
   2253 //..    return unop(Iop_64to32, mkexpr(r64));
   2254 //.. }
   2255 
   2256 
   2257 /* Generate IR to calculate an address indicated by a ModRM and
   2258    following SIB bytes.  The expression, and the number of bytes in
   2259    the address mode, are returned (the latter in *len).  Note that
   2260    this fn should not be called if the R/M part of the address denotes
   2261    a register instead of memory.  If print_codegen is true, text of
   2262    the addressing mode is placed in buf.
   2263 
   2264    The computed address is stored in a new tempreg, and the
   2265    identity of the tempreg is returned.
   2266 
   2267    extra_bytes holds the number of bytes after the amode, as supplied
   2268    by the caller.  This is needed to make sense of %rip-relative
   2269    addresses.  Note that the value that *len is set to is only the
   2270    length of the amode itself and does not include the value supplied
   2271    in extra_bytes.
   2272  */
   2273 
   2274 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2275 {
   2276    IRTemp tmp = newTemp(Ity_I64);
   2277    assign( tmp, addr64 );
   2278    return tmp;
   2279 }
   2280 
   2281 static
   2282 IRTemp disAMode ( /*OUT*/Int* len,
   2283                   VexAbiInfo* vbi, Prefix pfx, Long delta,
   2284                   /*OUT*/HChar* buf, Int extra_bytes )
   2285 {
   2286    UChar mod_reg_rm = getUChar(delta);
   2287    delta++;
   2288 
   2289    buf[0] = (UChar)0;
   2290    vassert(extra_bytes >= 0 && extra_bytes < 10);
   2291 
   2292    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2293       jump table seems a bit excessive.
   2294    */
   2295    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2296    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2297                                                /* is now XX0XXYYY */
   2298    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2299    switch (mod_reg_rm) {
   2300 
   2301       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2302          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2303       */
   2304       case 0x00: case 0x01: case 0x02: case 0x03:
   2305       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2306          { UChar rm = toUChar(mod_reg_rm & 7);
   2307            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2308            *len = 1;
   2309            return disAMode_copy2tmp(
   2310                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
   2311          }
   2312 
   2313       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2314          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2315       */
   2316       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2317       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2318          { UChar rm = toUChar(mod_reg_rm & 7);
   2319            Long d   = getSDisp8(delta);
   2320            if (d == 0) {
   2321               DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2322            } else {
   2323               DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2324            }
   2325            *len = 2;
   2326            return disAMode_copy2tmp(
   2327                   handleAddrOverrides(vbi, pfx,
   2328                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2329          }
   2330 
   2331       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2332          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2333       */
   2334       case 0x10: case 0x11: case 0x12: case 0x13:
   2335       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2336          { UChar rm = toUChar(mod_reg_rm & 7);
   2337            Long  d  = getSDisp32(delta);
   2338            DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2339            *len = 5;
   2340            return disAMode_copy2tmp(
   2341                   handleAddrOverrides(vbi, pfx,
   2342                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2343          }
   2344 
   2345       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2347       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2348       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2349          vpanic("disAMode(amd64): not an addr!");
   2350 
   2351       /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
   2352          correctly at the start of handling each instruction. */
   2353       case 0x05:
   2354          { Long d = getSDisp32(delta);
   2355            *len = 5;
   2356            DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
   2357            /* We need to know the next instruction's start address.
   2358               Try and figure out what it is, record the guess, and ask
   2359               the top-level driver logic (bbToIR_AMD64) to check we
   2360               guessed right, after the instruction is completely
   2361               decoded. */
   2362            guest_RIP_next_mustcheck = True;
   2363            guest_RIP_next_assumed = guest_RIP_bbstart
   2364                                     + delta+4 + extra_bytes;
   2365            return disAMode_copy2tmp(
   2366                      handleAddrOverrides(vbi, pfx,
   2367                         binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
   2368                                          mkU64(d))));
   2369          }
   2370 
   2371       case 0x04: {
   2372          /* SIB, with no displacement.  Special cases:
   2373             -- %rsp cannot act as an index value.
   2374                If index_r indicates %rsp, zero is used for the index.
   2375             -- when mod is zero and base indicates RBP or R13, base is
   2376                instead a 32-bit sign-extended literal.
   2377             It's all madness, I tell you.  Extract %index, %base and
   2378             scale from the SIB byte.  The value denoted is then:
   2379                | %index == %RSP && (%base == %RBP || %base == %R13)
   2380                = d32 following SIB byte
   2381                | %index == %RSP && !(%base == %RBP || %base == %R13)
   2382                = %base
   2383                | %index != %RSP && (%base == %RBP || %base == %R13)
   2384                = d32 following SIB byte + (%index << scale)
   2385                | %index != %RSP && !(%base == %RBP || %base == %R13)
   2386                = %base + (%index << scale)
   2387          */
   2388          UChar sib     = getUChar(delta);
   2389          UChar scale   = toUChar((sib >> 6) & 3);
   2390          UChar index_r = toUChar((sib >> 3) & 7);
   2391          UChar base_r  = toUChar(sib & 7);
   2392          /* correct since #(R13) == 8 + #(RBP) */
   2393          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2394          Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
   2395          delta++;
   2396 
   2397          if ((!index_is_SP) && (!base_is_BPor13)) {
   2398             if (scale == 0) {
   2399                DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2400                          nameIRegRexB(8,pfx,base_r),
   2401                          nameIReg64rexX(pfx,index_r));
   2402             } else {
   2403                DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2404                          nameIRegRexB(8,pfx,base_r),
   2405                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2406             }
   2407             *len = 2;
   2408             return
   2409                disAMode_copy2tmp(
   2410                handleAddrOverrides(vbi, pfx,
   2411                   binop(Iop_Add64,
   2412                         getIRegRexB(8,pfx,base_r),
   2413                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2414                               mkU8(scale)))));
   2415          }
   2416 
   2417          if ((!index_is_SP) && base_is_BPor13) {
   2418             Long d = getSDisp32(delta);
   2419             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
   2420                       nameIReg64rexX(pfx,index_r), 1<<scale);
   2421             *len = 6;
   2422             return
   2423                disAMode_copy2tmp(
   2424                handleAddrOverrides(vbi, pfx,
   2425                   binop(Iop_Add64,
   2426                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2427                                          mkU8(scale)),
   2428                         mkU64(d))));
   2429          }
   2430 
   2431          if (index_is_SP && (!base_is_BPor13)) {
   2432             DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
   2433             *len = 2;
   2434             return disAMode_copy2tmp(
   2435                    handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
   2436          }
   2437 
   2438          if (index_is_SP && base_is_BPor13) {
   2439             Long d = getSDisp32(delta);
   2440             DIS(buf, "%s%lld", segRegTxt(pfx), d);
   2441             *len = 6;
   2442             return disAMode_copy2tmp(
   2443                    handleAddrOverrides(vbi, pfx, mkU64(d)));
   2444          }
   2445 
   2446          vassert(0);
   2447       }
   2448 
   2449       /* SIB, with 8-bit displacement.  Special cases:
   2450          -- %esp cannot act as an index value.
   2451             If index_r indicates %esp, zero is used for the index.
   2452          Denoted value is:
   2453             | %index == %ESP
   2454             = d8 + %base
   2455             | %index != %ESP
   2456             = d8 + %base + (%index << scale)
   2457       */
   2458       case 0x0C: {
   2459          UChar sib     = getUChar(delta);
   2460          UChar scale   = toUChar((sib >> 6) & 3);
   2461          UChar index_r = toUChar((sib >> 3) & 7);
   2462          UChar base_r  = toUChar(sib & 7);
   2463          Long d        = getSDisp8(delta+1);
   2464 
   2465          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2466             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2467                                    d, nameIRegRexB(8,pfx,base_r));
   2468             *len = 3;
   2469             return disAMode_copy2tmp(
   2470                    handleAddrOverrides(vbi, pfx,
   2471                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2472          } else {
   2473             if (scale == 0) {
   2474                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2475                          nameIRegRexB(8,pfx,base_r),
   2476                          nameIReg64rexX(pfx,index_r));
   2477             } else {
   2478                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2479                          nameIRegRexB(8,pfx,base_r),
   2480                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2481             }
   2482             *len = 3;
   2483             return
   2484                 disAMode_copy2tmp(
   2485                 handleAddrOverrides(vbi, pfx,
   2486                   binop(Iop_Add64,
   2487                         binop(Iop_Add64,
   2488                               getIRegRexB(8,pfx,base_r),
   2489                               binop(Iop_Shl64,
   2490                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2491                         mkU64(d))));
   2492          }
   2493          vassert(0); /*NOTREACHED*/
   2494       }
   2495 
   2496       /* SIB, with 32-bit displacement.  Special cases:
   2497          -- %rsp cannot act as an index value.
   2498             If index_r indicates %rsp, zero is used for the index.
   2499          Denoted value is:
   2500             | %index == %RSP
   2501             = d32 + %base
   2502             | %index != %RSP
   2503             = d32 + %base + (%index << scale)
   2504       */
   2505       case 0x14: {
   2506          UChar sib     = getUChar(delta);
   2507          UChar scale   = toUChar((sib >> 6) & 3);
   2508          UChar index_r = toUChar((sib >> 3) & 7);
   2509          UChar base_r  = toUChar(sib & 7);
   2510          Long d        = getSDisp32(delta+1);
   2511 
   2512          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2513             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2514                                    d, nameIRegRexB(8,pfx,base_r));
   2515             *len = 6;
   2516             return disAMode_copy2tmp(
   2517                    handleAddrOverrides(vbi, pfx,
   2518                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2519          } else {
   2520             if (scale == 0) {
   2521                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2522                          nameIRegRexB(8,pfx,base_r),
   2523                          nameIReg64rexX(pfx,index_r));
   2524             } else {
   2525                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2526                          nameIRegRexB(8,pfx,base_r),
   2527                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2528             }
   2529             *len = 6;
   2530             return
   2531                 disAMode_copy2tmp(
   2532                 handleAddrOverrides(vbi, pfx,
   2533                   binop(Iop_Add64,
   2534                         binop(Iop_Add64,
   2535                               getIRegRexB(8,pfx,base_r),
   2536                               binop(Iop_Shl64,
   2537                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2538                         mkU64(d))));
   2539          }
   2540          vassert(0); /*NOTREACHED*/
   2541       }
   2542 
   2543       default:
   2544          vpanic("disAMode(amd64)");
   2545          return 0; /*notreached*/
   2546    }
   2547 }
   2548 
   2549 
   2550 /* Figure out the number of (insn-stream) bytes constituting the amode
   2551    beginning at delta.  Is useful for getting hold of literals beyond
   2552    the end of the amode before it has been disassembled.  */
   2553 
   2554 static UInt lengthAMode ( Prefix pfx, Long delta )
   2555 {
   2556    UChar mod_reg_rm = getUChar(delta);
   2557    delta++;
   2558 
   2559    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2560       jump table seems a bit excessive.
   2561    */
   2562    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2563    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2564                                                /* is now XX0XXYYY */
   2565    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2566    switch (mod_reg_rm) {
   2567 
   2568       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2569          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2570       */
   2571       case 0x00: case 0x01: case 0x02: case 0x03:
   2572       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2573          return 1;
   2574 
   2575       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2576          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2577       */
   2578       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2579       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2580          return 2;
   2581 
   2582       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2583          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2584       */
   2585       case 0x10: case 0x11: case 0x12: case 0x13:
   2586       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2587          return 5;
   2588 
   2589       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2590       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2591       /* Not an address, but still handled. */
   2592       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2593       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2594          return 1;
   2595 
   2596       /* RIP + disp32. */
   2597       case 0x05:
   2598          return 5;
   2599 
   2600       case 0x04: {
   2601          /* SIB, with no displacement. */
   2602          UChar sib     = getUChar(delta);
   2603          UChar base_r  = toUChar(sib & 7);
   2604          /* correct since #(R13) == 8 + #(RBP) */
   2605          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2606 
   2607          if (base_is_BPor13) {
   2608             return 6;
   2609          } else {
   2610             return 2;
   2611          }
   2612       }
   2613 
   2614       /* SIB, with 8-bit displacement. */
   2615       case 0x0C:
   2616          return 3;
   2617 
   2618       /* SIB, with 32-bit displacement. */
   2619       case 0x14:
   2620          return 6;
   2621 
   2622       default:
   2623          vpanic("lengthAMode(amd64)");
   2624          return 0; /*notreached*/
   2625    }
   2626 }
   2627 
   2628 
   2629 /*------------------------------------------------------------*/
   2630 /*--- Disassembling common idioms                          ---*/
   2631 /*------------------------------------------------------------*/
   2632 
   2633 /* Handle binary integer instructions of the form
   2634       op E, G  meaning
   2635       op reg-or-mem, reg
   2636    Is passed the a ptr to the modRM byte, the actual operation, and the
   2637    data size.  Returns the address advanced completely over this
   2638    instruction.
   2639 
   2640    E(src) is reg-or-mem
   2641    G(dst) is reg.
   2642 
   2643    If E is reg, -->    GET %G,  tmp
   2644                        OP %E,   tmp
   2645                        PUT tmp, %G
   2646 
   2647    If E is mem and OP is not reversible,
   2648                 -->    (getAddr E) -> tmpa
   2649                        LD (tmpa), tmpa
   2650                        GET %G, tmp2
   2651                        OP tmpa, tmp2
   2652                        PUT tmp2, %G
   2653 
   2654    If E is mem and OP is reversible
   2655                 -->    (getAddr E) -> tmpa
   2656                        LD (tmpa), tmpa
   2657                        OP %G, tmpa
   2658                        PUT tmpa, %G
   2659 */
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the E operand (reg or loaded mem) */
   IRTemp  dst0 = newTemp(ty);   /* original value of G */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow, which are handled by
      helpers rather than by a plain binop. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* E is a register: everything stays in temporaries. */
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  The register is zeroed first, so the reads
         below observe a defined value. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
	 putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: the helper computes the result and the flags. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         /* Plain binop; set flags from operands (add/sub) or from
            the result (logic ops). */
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False is the CMP/TEST-style case: flags only. */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory: load the operand, then proceed exactly
         as in the register case.  The destination G is a register,
         so no store (and no LOCK handling) is needed here. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   2754 
   2755 
   2756 
   2757 /* Handle binary integer instructions of the form
   2758       op G, E  meaning
   2759       op reg, reg-or-mem
   2760    Is passed the a ptr to the modRM byte, the actual operation, and the
   2761    data size.  Returns the address advanced completely over this
   2762    instruction.
   2763 
   2764    G(src) is reg.
   2765    E(dst) is reg-or-mem
   2766 
   2767    If E is reg, -->    GET %E,  tmp
   2768                        OP %G,   tmp
   2769                        PUT tmp, %E
   2770 
   2771    If E is mem, -->    (getAddr E) -> tmpa
   2772                        LD (tmpa), tmpv
   2773                        OP %G, tmpv
   2774                        ST tmpv, (tmpa)
   2775 */
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the G operand (always a register) */
   IRTemp  dst0 = newTemp(ty);   /* original value of E */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow, which are handled by
      helpers rather than by a plain binop. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* E is a register: no memory access, hence no LOCK handling. */
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg.  The register is zeroed
         first, so the reads below observe a defined value. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: the helper computes the result and the flags. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         /* Plain binop; set flags from operands (add/sub) or from
            the result (logic ops). */
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False is the CMP/TEST-style case: flags only. */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory: this is a read-modify-write on memory, so a
      LOCK prefix must be honoured by emitting a compare-and-swap
      style store (casLE / helper expVal) instead of a plain store. */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         /* Flags are set after the store, from the same temps. */
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   2891 
   2892 
   2893 /* Handle move instructions of the form
   2894       mov E, G  meaning
   2895       mov reg-or-mem, reg
   2896    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2897    the address advanced completely over this instruction.
   2898 
   2899    E(src) is reg-or-mem
   2900    G(dst) is reg.
   2901 
   2902    If E is reg, -->    GET %E,  tmpv
   2903                        PUT tmpv, %G
   2904 
   2905    If E is mem  -->    (getAddr E) -> tmpa
   2906                        LD (tmpa), tmpb
   2907                        PUT tmpb, %G
   2908 */
   2909 static
   2910 ULong dis_mov_E_G ( VexAbiInfo* vbi,
   2911                     Prefix      pfx,
   2912                     Int         size,
   2913                     Long        delta0 )
   2914 {
   2915    Int len;
   2916    UChar rm = getUChar(delta0);
   2917    HChar dis_buf[50];
   2918 
   2919    if (epartIsReg(rm)) {
   2920       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   2921       DIP("mov%c %s,%s\n", nameISize(size),
   2922                            nameIRegE(size,pfx,rm),
   2923                            nameIRegG(size,pfx,rm));
   2924       return 1+delta0;
   2925    }
   2926 
   2927    /* E refers to memory */
   2928    {
   2929       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   2930       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   2931       DIP("mov%c %s,%s\n", nameISize(size),
   2932                            dis_buf,
   2933                            nameIRegG(size,pfx,rm));
   2934       return delta0+len;
   2935    }
   2936 }
   2937 
   2938 
   2939 /* Handle move instructions of the form
   2940       mov G, E  meaning
   2941       mov reg, reg-or-mem
   2942    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2943    the address advanced completely over this instruction.
   2944 
   2945    G(src) is reg.
   2946    E(dst) is reg-or-mem
   2947 
   2948    If E is reg, -->    GET %G,  tmp
   2949                        PUT tmp, %E
   2950 
   2951    If E is mem, -->    (getAddr E) -> tmpa
   2952                        GET %G, tmpv
   2953                        ST tmpv, (tmpa)
   2954 */
   2955 static
   2956 ULong dis_mov_G_E ( VexAbiInfo* vbi,
   2957                     Prefix      pfx,
   2958                     Int         size,
   2959                     Long        delta0 )
   2960 {
   2961    Int len;
   2962    UChar rm = getUChar(delta0);
   2963    HChar dis_buf[50];
   2964 
   2965    if (epartIsReg(rm)) {
   2966       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   2967       DIP("mov%c %s,%s\n", nameISize(size),
   2968                            nameIRegG(size,pfx,rm),
   2969                            nameIRegE(size,pfx,rm));
   2970       return 1+delta0;
   2971    }
   2972 
   2973    /* E refers to memory */
   2974    {
   2975       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   2976       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   2977       DIP("mov%c %s,%s\n", nameISize(size),
   2978                            nameIRegG(size,pfx,rm),
   2979                            dis_buf);
   2980       return len+delta0;
   2981    }
   2982 }
   2983 
   2984 
   2985 /* op $immediate, AL/AX/EAX/RAX. */
static
ULong dis_op_imm_A ( Int    size,
                     Bool   carrying,
                     IROp   op8,
                     Bool   keep,
                     Long   delta,
                     HChar* t_amd64opc )
{
   Int    size4 = imin(size,4);   /* imm field is at most 4 bytes */
   IRType ty    = szToITy(size);
   IRTemp dst0  = newTemp(ty);    /* original RAX/EAX/AX/AL value */
   IRTemp src   = newTemp(ty);    /* the (masked) immediate */
   IRTemp dst1  = newTemp(ty);    /* result of the operation */
   Long  lit    = getSDisp(size4,delta);  /* sign-extended immediate */
   assign(dst0, getIRegRAX(size));
   /* Truncate the sign-extended literal back to the operand size. */
   assign(src,  mkU(ty,lit & mkSizeMask(size)));

   if (isAddSub(op8) && !carrying) {
      /* ADD/SUB: flags depend on both operands. */
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1_DEP2(op8, dst0, src, ty);
   }
   else
   if (isLogic(op8)) {
      /* AND/OR/XOR: flags depend only on the result. */
      vassert(!carrying);
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1(op8, dst1, ty);
   }
   else
   if (op8 == Iop_Add8 && carrying) {
      /* ADC: helper computes result and flags. */
      helper_ADC( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
   if (op8 == Iop_Sub8 && carrying) {
      /* SBB: likewise via helper. */
      helper_SBB( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
      vpanic("dis_op_imm_A(amd64,guest)");

   /* keep==False is the CMP/TEST-style case: flags only. */
   if (keep)
      putIRegRAX(size, mkexpr(dst1));

   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
                           lit, nameIRegRAX(size));
   return delta+size4;
}
   3033 
   3034 
   3035 /* Sign- and Zero-extending moves. */
   3036 static
   3037 ULong dis_movx_E_G ( VexAbiInfo* vbi,
   3038                      Prefix pfx,
   3039                      Long delta, Int szs, Int szd, Bool sign_extend )
   3040 {
   3041    UChar rm = getUChar(delta);
   3042    if (epartIsReg(rm)) {
   3043       putIRegG(szd, pfx, rm,
   3044                     doScalarWidening(
   3045                        szs,szd,sign_extend,
   3046                        getIRegE(szs,pfx,rm)));
   3047       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3048                                nameISize(szs),
   3049                                nameISize(szd),
   3050                                nameIRegE(szs,pfx,rm),
   3051                                nameIRegG(szd,pfx,rm));
   3052       return 1+delta;
   3053    }
   3054 
   3055    /* E refers to memory */
   3056    {
   3057       Int    len;
   3058       HChar  dis_buf[50];
   3059       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3060       putIRegG(szd, pfx, rm,
   3061                     doScalarWidening(
   3062                        szs,szd,sign_extend,
   3063                        loadLE(szToITy(szs),mkexpr(addr))));
   3064       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3065                                nameISize(szs),
   3066                                nameISize(szd),
   3067                                dis_buf,
   3068                                nameIRegG(szd,pfx,rm));
   3069       return len+delta;
   3070    }
   3071 }
   3072 
   3073 
   3074 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3075    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* RDX:RAX / t.  The DivMod op yields quotient in the low half
         and remainder in the high half (x86 DIV/IDIV semantics:
         quotient -> RAX, remainder -> RDX). */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 32/16/8-bit cases all go through the 64/32 DivMod op,
         widening the dividend and divisor as needed. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t, directly via the 64/32 op. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t: widen the 32-bit dividend to 64 and the
            16-bit divisor to 32, then narrow the results back. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t: quotient goes to AL, remainder to AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3139 
/* Group 1 extended opcodes: ADD/OR/ADC/SBB/AND/SUB/XOR/CMP with an
   immediate source, the operation being selected by the reg field of
   the modrm byte.  On entry 'delta' points at the modrm byte; 'am_sz'
   is the length of the addressing-mode bytes, 'd_sz' the length of
   the immediate, 'sz' the operation size, and 'd64' the immediate
   value itself.  Returns the updated 'delta'. */
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* Map the reg field to the 8-bit form of the operation; the sized
      variant is produced later via mkSizedOp.  ADC and SBB need the
      incoming carry flag and are handled by dedicated helpers, so
      they get no op8 here.  Case 7 is CMP: it reuses Iop_Sub8 but the
      result is never written back (guarded by the '< 7' tests
      below). */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      /* Register destination: no memory access, no LOCK concerns. */
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (case 7) only sets flags; everything else writes back. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      /* Memory destination. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (case 7) never stores. */
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               /* LOCK prefix: write back via compare-and-swap against
                  the value originally loaded, restarting the insn on
                  failure. */
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3245 
   3246 
   3247 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3248    expression. */
   3249 
static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte.  Handles the shift and
      rotate forms ROL/ROR/RCL/RCR/SHL/SHR/SAL/SAR, selected by the
      reg field of modrm.  'shift_expr' must be Ity_I8-typed;
      'shift_expr_txt', if non-NULL, is its printable form for
      disassembly tracing. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the operation: 4..7 = SHL/SHR/SAL/SAR, 0..1 = ROL/ROR,
      2..3 = RCL/RCR (rotate through carry). */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz selects the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negated sz selects the resulting rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Install the helper-computed flags verbatim via the COPY
         thunk. */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      /* Note cases 4 (SHL) and 6 (SAL) are the same operation. */
      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK)
         -- the "shifted one less" value, needed by the flags thunk to
         recover the last bit shifted out (the new carry). */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      /* NB: despite the name, rot_amt64 is Ity_I8; it holds the
         amount masked to 63 (or 31), before the per-size masking. */
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before.
         Each Mux0X keeps the prior thunk field when rot_amt64 is
         zero, so a zero-count rotate leaves flags untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3517 
   3518 
   3519 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte.
      Handles BT/BTS/BTR/BTC with an immediate bit-offset operand.
      Sets *decode_OK False (and leaves delta unchanged) for sizes or
      reg-field values it cannot handle. */

   IRType ty     = szToITy(sz);
   /* The tested word is always manipulated at 64 bits, regardless of
      the operation size. */
   IRTemp t2     = newTemp(Ity_I64);
   IRTemp t2m    = newTemp(Ity_I64);
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      /* +1 tells disAMode one immediate byte follows the amode. */
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
	putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (pfx & PFX_LOCK) {
            /* LOCK prefix: commit via compare-and-swap against the
               originally-loaded value. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3632 
   3633 
   3634 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3635    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3636    RDX:RAX/EDX:EAX/DX:AX/AX.
   3637 */
   3638 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3639                                IRTemp tmp, HChar* tmp_txt )
   3640 {
   3641    IRType ty = szToITy(sz);
   3642    IRTemp t1 = newTemp(ty);
   3643 
   3644    assign( t1, getIRegRAX(sz) );
   3645 
   3646    switch (ty) {
   3647       case Ity_I64: {
   3648          IRTemp res128  = newTemp(Ity_I128);
   3649          IRTemp resHi   = newTemp(Ity_I64);
   3650          IRTemp resLo   = newTemp(Ity_I64);
   3651          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3652          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3653          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3654          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3655          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3656          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3657          putIReg64(R_RDX, mkexpr(resHi));
   3658          putIReg64(R_RAX, mkexpr(resLo));
   3659          break;
   3660       }
   3661       case Ity_I32: {
   3662          IRTemp res64   = newTemp(Ity_I64);
   3663          IRTemp resHi   = newTemp(Ity_I32);
   3664          IRTemp resLo   = newTemp(Ity_I32);
   3665          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3666          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3667          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3668          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3669          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3670          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3671          putIRegRDX(4, mkexpr(resHi));
   3672          putIRegRAX(4, mkexpr(resLo));
   3673          break;
   3674       }
   3675       case Ity_I16: {
   3676          IRTemp res32   = newTemp(Ity_I32);
   3677          IRTemp resHi   = newTemp(Ity_I16);
   3678          IRTemp resLo   = newTemp(Ity_I16);
   3679          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3680          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3681          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3682          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3683          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3684          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3685          putIRegRDX(2, mkexpr(resHi));
   3686          putIRegRAX(2, mkexpr(resLo));
   3687          break;
   3688       }
   3689       case Ity_I8: {
   3690          IRTemp res16   = newTemp(Ity_I16);
   3691          IRTemp resHi   = newTemp(Ity_I8);
   3692          IRTemp resLo   = newTemp(Ity_I8);
   3693          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3694          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3695          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3696          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3697          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3698          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3699          putIRegRAX(2, mkexpr(res16));
   3700          break;
   3701       }
   3702       default:
   3703          ppIRType(ty);
   3704          vpanic("codegen_mulL_A_D(amd64)");
   3705    }
   3706    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3707 }
   3708 
   3709 
   3710 /* Group 3 extended opcodes. */
/* Operations: TEST/NOT/NEG/MUL/IMUL/DIV/IDIV, selected by the reg
   field of the modrm byte.  Sets *decode_OK False for the invalid
   encoding (reg field == 1). */
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Register operand forms. */
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            /* AND with an immediate, flags only -- nothing written
               back.  Immediate is at most 4 bytes even for sz==8. */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* Invalid encoding. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            /* Computed as 0 - operand so the standard SUB flags
               thunk applies. */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Memory operand forms: load the operand once into t1, then
         dispatch as above. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
			   bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            /* Invalid encoding. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* LOCK prefix: write back via compare-and-swap. */
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
}
   3872 
   3873 
   3874 /* Group 4 extended opcodes. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   /* Handle the Grp4 (0xFE) extended opcodes: byte-sized INC/DEC on
      either a register or a memory operand, selected by the reg field
      of the modrm byte.  'delta' points at the modrm byte; the updated
      delta is returned.  *decode_OK is set False for forms we cannot
      decode. */
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;          /* Grp4 operations are always 8-bit */
   IRTemp t1 = newTemp(ty);     /* original operand value */
   IRTemp t2 = newTemp(ty);     /* incremented/decremented value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Register operand: read, modify, write back, set flags. */
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Memory operand: load, modify, store -- or, when LOCK'd, a
         compare-and-swap so the read-modify-write appears atomic. */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
}
   3942 
   3943 
   3944 /* Group 5 extended opcodes. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 DisResult* dres, Bool* decode_OK )
{
   /* Handle the Grp5 (0xFF) extended opcodes -- INC, DEC, CALL, JMP
      and PUSH on a register or memory operand, selected by the reg
      field of the modrm byte.  'delta' points at the modrm byte and
      the updated delta is returned.  *decode_OK is set False for
      forms we cannot decode.  The CALL and JMP forms terminate the
      block (dres->whatNext = Dis_StopHere). */
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);      /* the original operand value */
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;         /* print a size suffix in DIP? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            /* Push the return address, note the new SP for redzone
               purposes, then transfer control to the callee. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* Only INC/DEC need the operand preloaded into t1 at size ty;
         the CALL/JMP/PUSH cases (2, 4, 6) load it themselves. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               /* LOCK'd: compare-and-swap so the read-modify-write
                  appears atomic. */
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            /* Push the return address and jump to the callee. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (!(sz == 8 || sz == 2)) goto unhandled;
            if (sz == 8) {
               t3 = newTemp(Ity_I64);
               assign(t3, loadLE(Ity_I64,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandled; /* awaiting test case */
            }
         default:
         unhandled:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4096 
   4097 
   4098 /*------------------------------------------------------------*/
   4099 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4100 /*------------------------------------------------------------*/
   4101 
   4102 /* Code shared by all the string ops */
   4103 static
   4104 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4105 {
   4106    UChar logSz;
   4107    if (sz == 8 || sz == 4 || sz == 2) {
   4108       logSz = 1;
   4109       if (sz == 4) logSz = 2;
   4110       if (sz == 8) logSz = 3;
   4111       assign( t_inc,
   4112               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4113                                mkU8(logSz) ) );
   4114    } else {
   4115       assign( t_inc,
   4116               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4117    }
   4118 }
   4119 
   4120 static
   4121 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4122                     Int sz, HChar* name, Prefix pfx )
   4123 {
   4124    IRTemp t_inc = newTemp(Ity_I64);
   4125    /* Really we ought to inspect the override prefixes, but we don't.
   4126       The following assertion catches any resulting sillyness. */
   4127    vassert(pfx == clearSegBits(pfx));
   4128    dis_string_op_increment(sz, t_inc);
   4129    dis_OP( sz, t_inc, pfx );
   4130    DIP("%s%c\n", name, nameISize(sz));
   4131 }
   4132 
   4133 static
   4134 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4135 {
   4136    IRType ty = szToITy(sz);
   4137    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4138    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4139    IRExpr *incd, *incs;
   4140 
   4141    if (haveASO(pfx)) {
   4142       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4143       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4144    } else {
   4145       assign( td, getIReg64(R_RDI) );
   4146       assign( ts, getIReg64(R_RSI) );
   4147    }
   4148 
   4149    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4150 
   4151    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4152    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4153    if (haveASO(pfx)) {
   4154       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4155       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4156    }
   4157    putIReg64( R_RDI, incd );
   4158    putIReg64( R_RSI, incs );
   4159 }
   4160 
   4161 static
   4162 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4163 {
   4164    IRType ty = szToITy(sz);
   4165    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4166    IRExpr *incs;
   4167 
   4168    if (haveASO(pfx))
   4169       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4170    else
   4171       assign( ts, getIReg64(R_RSI) );
   4172 
   4173    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4174 
   4175    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4176    if (haveASO(pfx))
   4177       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4178    putIReg64( R_RSI, incs );
   4179 }
   4180 
   4181 static
   4182 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4183 {
   4184    IRType ty = szToITy(sz);
   4185    IRTemp ta = newTemp(ty);        /* rAX */
   4186    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4187    IRExpr *incd;
   4188 
   4189    assign( ta, getIRegRAX(sz) );
   4190 
   4191    if (haveASO(pfx))
   4192       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4193    else
   4194       assign( td, getIReg64(R_RDI) );
   4195 
   4196    storeLE( mkexpr(td), mkexpr(ta) );
   4197 
   4198    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4199    if (haveASO(pfx))
   4200       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4201    putIReg64( R_RDI, incd );
   4202 }
   4203 
   4204 static
   4205 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4206 {
   4207    IRType ty  = szToITy(sz);
   4208    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4209    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4210    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4211    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4212    IRExpr *incd, *incs;
   4213 
   4214    if (haveASO(pfx)) {
   4215       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4216       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4217    } else {
   4218       assign( td, getIReg64(R_RDI) );
   4219       assign( ts, getIReg64(R_RSI) );
   4220    }
   4221 
   4222    assign( tdv, loadLE(ty,mkexpr(td)) );
   4223 
   4224    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4225 
   4226    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4227 
   4228    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4229    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4230    if (haveASO(pfx)) {
   4231       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4232       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4233    }
   4234    putIReg64( R_RDI, incd );
   4235    putIReg64( R_RSI, incs );
   4236 }
   4237 
   4238 static
   4239 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4240 {
   4241    IRType ty  = szToITy(sz);
   4242    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4243    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4244    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4245    IRExpr *incd;
   4246 
   4247    assign( ta, getIRegRAX(sz) );
   4248 
   4249    if (haveASO(pfx))
   4250       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4251    else
   4252       assign( td, getIReg64(R_RDI) );
   4253 
   4254    assign( tdv, loadLE(ty,mkexpr(td)) );
   4255 
   4256    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4257 
   4258    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4259    if (haveASO(pfx))
   4260       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4261    putIReg64( R_RDI, incd );
   4262 }
   4263 
   4264 
   4265 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4266    the insn is the last one in the basic block, and so emit a jump to
   4267    the next insn, rather than just falling through. */
   4268 static
   4269 void dis_REP_op ( AMD64Condcode cond,
   4270                   void (*dis_OP)(Int, IRTemp, Prefix),
   4271                   Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
   4272                   Prefix pfx )
   4273 {
   4274    IRTemp t_inc = newTemp(Ity_I64);
   4275    IRTemp tc;
   4276    IRExpr* cmp;
   4277 
   4278    /* Really we ought to inspect the override prefixes, but we don't.
   4279       The following assertion catches any resulting sillyness. */
   4280    vassert(pfx == clearSegBits(pfx));
   4281 
   4282    if (haveASO(pfx)) {
   4283       tc = newTemp(Ity_I32);  /*  ECX  */
   4284       assign( tc, getIReg32(R_RCX) );
   4285       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4286    } else {
   4287       tc = newTemp(Ity_I64);  /*  RCX  */
   4288       assign( tc, getIReg64(R_RCX) );
   4289       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4290    }
   4291 
   4292    stmt( IRStmt_Exit( cmp, Ijk_Boring, IRConst_U64(rip_next) ) );
   4293 
   4294    if (haveASO(pfx))
   4295       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   4296   else
   4297       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4298 
   4299    dis_string_op_increment(sz, t_inc);
   4300    dis_OP (sz, t_inc, pfx);
   4301 
   4302    if (cond == AMD64CondAlways) {
   4303       jmp_lit(Ijk_Boring,rip);
   4304    } else {
   4305       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4306                          Ijk_Boring,
   4307                          IRConst_U64(rip) ) );
   4308       jmp_lit(Ijk_Boring,rip_next);
   4309    }
   4310    DIP("%s%c\n", name, nameISize(sz));
   4311 }
   4312 
   4313 
   4314 /*------------------------------------------------------------*/
   4315 /*--- Arithmetic, etc.                                     ---*/
   4316 /*------------------------------------------------------------*/
   4317 
   4318 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   4319 static
   4320 ULong dis_mul_E_G ( VexAbiInfo* vbi,
   4321                     Prefix      pfx,
   4322                     Int         size,
   4323                     Long        delta0 )
   4324 {
   4325    Int    alen;
   4326    HChar  dis_buf[50];
   4327    UChar  rm = getUChar(delta0);
   4328    IRType ty = szToITy(size);
   4329    IRTemp te = newTemp(ty);
   4330    IRTemp tg = newTemp(ty);
   4331    IRTemp resLo = newTemp(ty);
   4332 
   4333    assign( tg, getIRegG(size, pfx, rm) );
   4334    if (epartIsReg(rm)) {
   4335       assign( te, getIRegE(size, pfx, rm) );
   4336    } else {
   4337       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4338       assign( te, loadLE(ty,mkexpr(addr)) );
   4339    }
   4340 
   4341    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4342 
   4343    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4344 
   4345    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4346 
   4347    if (epartIsReg(rm)) {
   4348       DIP("imul%c %s, %s\n", nameISize(size),
   4349                              nameIRegE(size,pfx,rm),
   4350                              nameIRegG(size,pfx,rm));
   4351       return 1+delta0;
   4352    } else {
   4353       DIP("imul%c %s, %s\n", nameISize(size),
   4354                              dis_buf,
   4355                              nameIRegG(size,pfx,rm));
   4356       return alen+delta0;
   4357    }
   4358 }
   4359 
   4360 
   4361 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   /* Three-operand IMUL (G := E * immediate).  'delta' points at the
      modR/M byte.  'litsize' is the nominal immediate width; it is
      capped at 4 bytes since 64-bit forms take a sign-extended imm32.
      Returns the updated delta. */
   Long   d64;
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* E operand (register or memory) */
   IRTemp tl = newTemp(ty);      /* the immediate, at operand size */
   IRTemp resLo = newTemp(ty);   /* low half of the product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* The final arg tells disAMode how many immediate bytes follow
         the amode, so RIP-relative addresses resolve correctly. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* Truncate the sign-extended immediate to the operand size. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   /* Flags reflect a signed widening multiply of E and the imm. */
   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4407 
   4408 
   4409 /* Generate an IR sequence to do a popcount operation on the supplied
   4410    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4411    Ity_I16, Ity_I32 or Ity_I64 only. */
   4412 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4413 {
   4414    Int i;
   4415    if (ty == Ity_I16) {
   4416       IRTemp old = IRTemp_INVALID;
   4417       IRTemp nyu = IRTemp_INVALID;
   4418       IRTemp mask[4], shift[4];
   4419       for (i = 0; i < 4; i++) {
   4420          mask[i]  = newTemp(ty);
   4421          shift[i] = 1 << i;
   4422       }
   4423       assign(mask[0], mkU16(0x5555));
   4424       assign(mask[1], mkU16(0x3333));
   4425       assign(mask[2], mkU16(0x0F0F));
   4426       assign(mask[3], mkU16(0x00FF));
   4427       old = src;
   4428       for (i = 0; i < 4; i++) {
   4429          nyu = newTemp(ty);
   4430          assign(nyu,
   4431                 binop(Iop_Add16,
   4432                       binop(Iop_And16,
   4433                             mkexpr(old),
   4434                             mkexpr(mask[i])),
   4435                       binop(Iop_And16,
   4436                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4437                             mkexpr(mask[i]))));
   4438          old = nyu;
   4439       }
   4440       return nyu;
   4441    }
   4442    if (ty == Ity_I32) {
   4443       IRTemp old = IRTemp_INVALID;
   4444       IRTemp nyu = IRTemp_INVALID;
   4445       IRTemp mask[5], shift[5];
   4446       for (i = 0; i < 5; i++) {
   4447          mask[i]  = newTemp(ty);
   4448          shift[i] = 1 << i;
   4449       }
   4450       assign(mask[0], mkU32(0x55555555));
   4451       assign(mask[1], mkU32(0x33333333));
   4452       assign(mask[2], mkU32(0x0F0F0F0F));
   4453       assign(mask[3], mkU32(0x00FF00FF));
   4454       assign(mask[4], mkU32(0x0000FFFF));
   4455       old = src;
   4456       for (i = 0; i < 5; i++) {
   4457          nyu = newTemp(ty);
   4458          assign(nyu,
   4459                 binop(Iop_Add32,
   4460                       binop(Iop_And32,
   4461                             mkexpr(old),
   4462                             mkexpr(mask[i])),
   4463                       binop(Iop_And32,
   4464                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4465                             mkexpr(mask[i]))));
   4466          old = nyu;
   4467       }
   4468       return nyu;
   4469    }
   4470    if (ty == Ity_I64) {
   4471       IRTemp old = IRTemp_INVALID;
   4472       IRTemp nyu = IRTemp_INVALID;
   4473       IRTemp mask[6], shift[6];
   4474       for (i = 0; i < 6; i++) {
   4475          mask[i]  = newTemp(ty);
   4476          shift[i] = 1 << i;
   4477       }
   4478       assign(mask[0], mkU64(0x5555555555555555ULL));
   4479       assign(mask[1], mkU64(0x3333333333333333ULL));
   4480       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4481       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4482       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4483       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4484       old = src;
   4485       for (i = 0; i < 6; i++) {
   4486          nyu = newTemp(ty);
   4487          assign(nyu,
   4488                 binop(Iop_Add64,
   4489                       binop(Iop_And64,
   4490                             mkexpr(old),
   4491                             mkexpr(mask[i])),
   4492                       binop(Iop_And64,
   4493                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4494                             mkexpr(mask[i]))));
   4495          old = nyu;
   4496       }
   4497       return nyu;
   4498    }
   4499    /*NOTREACHED*/
   4500    vassert(0);
   4501 }
   4502 
   4503 
   4504 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4505    the supplied IRTemp, and return a new IRTemp holding the result.
   4506    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4507    the argument is zero, return the number of bits in the word (the
   4508    natural semantics). */
   4509 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4510 {
   4511    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4512 
   4513    IRTemp src64 = newTemp(Ity_I64);
   4514    assign(src64, widenUto64( mkexpr(src) ));
   4515 
   4516    IRTemp src64x = newTemp(Ity_I64);
   4517    assign(src64x,
   4518           binop(Iop_Shl64, mkexpr(src64),
   4519                            mkU8(64 - 8 * sizeofIRType(ty))));
   4520 
   4521    // Clz64 has undefined semantics when its input is zero, so
   4522    // special-case around that.
   4523    IRTemp res64 = newTemp(Ity_I64);
   4524    assign(res64,
   4525           IRExpr_Mux0X(
   4526              unop(Iop_1Uto8,
   4527                   binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
   4528              unop(Iop_Clz64, mkexpr(src64x)),
   4529              mkU64(8 * sizeofIRType(ty))
   4530    ));
   4531 
   4532    IRTemp res = newTemp(ty);
   4533    assign(res, narrowTo(ty, mkexpr(res64)));
   4534    return res;
   4535 }
   4536 
   4537 
   4538 /*------------------------------------------------------------*/
   4539 /*---                                                      ---*/
   4540 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4541 /*---                                                      ---*/
   4542 /*------------------------------------------------------------*/
   4543 
   4544 /* --- Helper functions for dealing with the register stack. --- */
   4545 
   4546 /* --- Set the emulation-warning pseudo-register. --- */
   4547 
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* Write 'e' to the emulation-warning pseudo-register. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
}
   4553 
   4554 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4555 
   4556 static IRExpr* mkQNaN64 ( void )
   4557 {
   4558   /* QNaN is 0 2047 1 0(51times)
   4559      == 0b 11111111111b 1 0(51times)
   4560      == 0x7FF8 0000 0000 0000
   4561    */
   4562    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   4563 }
   4564 
   4565 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4566 
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack index from the guest state. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4571 
static void put_ftop ( IRExpr* e )
{
   /* Write 'e' (:: Ity_I32) to the x87 top-of-stack index. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4577 
   4578 /* --------- Get/put the C3210 bits. --------- */
   4579 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* Read the FPU C3..C0 condition bits from the guest state. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4584 
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   /* Write 'e' to the FPU C3..C0 condition bits in the guest state. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4590 
   4591 /* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   /* Read the FPU rounding mode, narrowed from its I64 guest slot. */
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4596 
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   /* Write the FPU rounding mode, widened into its I64 guest slot. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4602 
   4603 
   4604 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4605 /* Produces a value in 0 .. 3, which is encoded as per the type
   4606    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4607    per IRRoundingMode, we merely need to get it and mask it for
   4608    safety.
   4609 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already encoded per IRRoundingMode; masking to
      the low two bits merely guards against junk in upper bits. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4614 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Placeholder used where the true rounding mode is ignored and
      round-to-nearest is simply assumed (see XXXROUNDINGFIXME at the
      call sites). */
   return mkU32(Irrm_NEAREST);
}
   4619 
   4620 
   4621 /* --------- Get/set FP register tag bytes. --------- */
   4622 
   4623 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4624 
   4625 static void put_ST_TAG ( Int i, IRExpr* value )
   4626 {
   4627    IRRegArray* descr;
   4628    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   4629    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4630    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   4631 }
   4632 
   4633 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   4634    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   4635 
   4636 static IRExpr* get_ST_TAG ( Int i )
   4637 {
   4638    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4639    return IRExpr_GetI( descr, get_ftop(), i );
   4640 }
   4641 
   4642 
   4643 /* --------- Get/set FP registers. --------- */
   4644 
   4645 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   4646    register's tag to indicate the register is full.  The previous
   4647    state of the register is not checked. */
   4648 
   4649 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   4650 {
   4651    IRRegArray* descr;
   4652    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   4653    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4654    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   4655    /* Mark the register as in-use. */
   4656    put_ST_TAG(i, mkU8(1));
   4657 }
   4658 
   4659 /* Given i, and some expression e, emit
   4660       ST(i) = is_full(i) ? NaN : e
   4661    and set the tag accordingly.
   4662 */
   4663 
   4664 static void put_ST ( Int i, IRExpr* value )
   4665 {
   4666    put_ST_UNCHECKED( i,
   4667                      IRExpr_Mux0X( get_ST_TAG(i),
   4668                                    /* 0 means empty */
   4669                                    value,
   4670                                    /* non-0 means full */
   4671                                    mkQNaN64()
   4672                    )
   4673    );
   4674 }
   4675 
   4676 
   4677 /* Given i, generate an expression yielding 'ST(i)'. */
   4678 
   4679 static IRExpr* get_ST_UNCHECKED ( Int i )
   4680 {
   4681    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4682    return IRExpr_GetI( descr, get_ftop(), i );
   4683 }
   4684 
   4685 
   4686 /* Given i, generate an expression yielding
   4687   is_full(i) ? ST(i) : NaN
   4688 */
   4689 
   4690 static IRExpr* get_ST ( Int i )
   4691 {
   4692    return
   4693       IRExpr_Mux0X( get_ST_TAG(i),
   4694                     /* 0 means empty */
   4695                     mkQNaN64(),
   4696                     /* non-0 means full */
   4697                     get_ST_UNCHECKED(i));
   4698 }
   4699 
   4700 
   4701 /* Adjust FTOP downwards by one register. */
   4702 
static void fp_push ( void )
{
   /* Adjust FTOP downwards by one register. */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   4707 
   4708 /* Adjust FTOP upwards by one register, and mark the vacated register
   4709    as empty.  */
   4710 
   4711 static void fp_pop ( void )
   4712 {
   4713    put_ST_TAG(0, mkU8(0));
   4714    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4715 }
   4716 
   4717 /* Clear the C2 bit of the FPU status register, for
   4718    sin/cos/tan/sincos. */
   4719 
   4720 static void clear_C2 ( void )
   4721 {
   4722    put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
   4723 }
   4724 
   4725 /* Invent a plausible-looking FPU status word value:
   4726       ((ftop & 7) << 11) | (c3210 & 0x4700)
   4727  */
   4728 static IRExpr* get_FPU_sw ( void )
   4729 {
   4730    return
   4731       unop(Iop_32to16,
   4732            binop(Iop_Or32,
   4733                  binop(Iop_Shl32,
   4734                        binop(Iop_And32, get_ftop(), mkU32(7)),
   4735                              mkU8(11)),
   4736                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   4737                                         mkU32(0x4700))
   4738       ));
   4739 }
   4740 
   4741 
   4742 /* ------------------------------------------------------- */
   4743 /* Given all that stack-mangling junk, we can now go ahead
   4744    and describe FP instructions.
   4745 */
   4746 
   4747 /* ST(0) = ST(0) `op` mem64/32(addr)
   4748    Need to check ST(0)'s tag on read, but not on write.
   4749 */
   4750 static
   4751 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4752                          IROp op, Bool dbl )
   4753 {
   4754    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4755    if (dbl) {
   4756       put_ST_UNCHECKED(0,
   4757          triop( op,
   4758                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4759                 get_ST(0),
   4760                 loadLE(Ity_F64,mkexpr(addr))
   4761          ));
   4762    } else {
   4763       put_ST_UNCHECKED(0,
   4764          triop( op,
   4765                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4766                 get_ST(0),
   4767                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   4768          ));
   4769    }
   4770 }
   4771 
   4772 
   4773 /* ST(0) = mem64/32(addr) `op` ST(0)
   4774    Need to check ST(0)'s tag on read, but not on write.
   4775 */
   4776 static
   4777 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4778                             IROp op, Bool dbl )
   4779 {
   4780    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4781    if (dbl) {
   4782       put_ST_UNCHECKED(0,
   4783          triop( op,
   4784                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4785                 loadLE(Ity_F64,mkexpr(addr)),
   4786                 get_ST(0)
   4787          ));
   4788    } else {
   4789       put_ST_UNCHECKED(0,
   4790          triop( op,
   4791                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4792                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   4793                 get_ST(0)
   4794          ));
   4795    }
   4796 }
   4797 
   4798 
   4799 /* ST(dst) = ST(dst) `op` ST(src).
   4800    Check dst and src tags when reading but not on write.
   4801 */
   4802 static
   4803 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4804                       Bool pop_after )
   4805 {
   4806    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4807    put_ST_UNCHECKED(
   4808       st_dst,
   4809       triop( op,
   4810              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4811              get_ST(st_dst),
   4812              get_ST(st_src) )
   4813    );
   4814    if (pop_after)
   4815       fp_pop();
   4816 }
   4817 
   4818 /* ST(dst) = ST(src) `op` ST(dst).
   4819    Check dst and src tags when reading but not on write.
   4820 */
   4821 static
   4822 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4823                          Bool pop_after )
   4824 {
   4825    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4826    put_ST_UNCHECKED(
   4827       st_dst,
   4828       triop( op,
   4829              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4830              get_ST(st_src),
   4831              get_ST(st_dst) )
   4832    );
   4833    if (pop_after)
   4834       fp_pop();
   4835 }
   4836 
   4837 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   4838 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   4839 {
   4840    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   4841    /* This is a bit of a hack (and isn't really right).  It sets
   4842       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   4843       documentation implies A and S are unchanged.
   4844    */
   4845    /* It's also fishy in that it is used both for COMIP and
   4846       UCOMIP, and they aren't the same (although similar). */
   4847    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   4848    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   4849    stmt( IRStmt_Put(
   4850             OFFB_CC_DEP1,
   4851             binop( Iop_And64,
   4852                    unop( Iop_32Uto64,
   4853                          binop(Iop_CmpF64, get_ST(0), get_ST(i))),
   4854                    mkU64(0x45)
   4855         )));
   4856    if (pop_after)
   4857       fp_pop();
   4858 }
   4859 
   4860 
   4861 /* returns
   4862    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   4863 */
   4864 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   4865 {
   4866    IRTemp t32 = newTemp(Ity_I32);
   4867    assign( t32, e32 );
   4868    return
   4869       IRExpr_Mux0X(
   4870          unop(Iop_1Uto8,
   4871               binop(Iop_CmpLT64U,
   4872                     unop(Iop_32Uto64,
   4873                          binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   4874                     mkU64(65536))),
   4875          mkU16( 0x8000 ),
   4876          unop(Iop_32to16, mkexpr(t32)));
   4877 }
   4878 
   4879 
   4880 static
   4881 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   4882                 VexAbiInfo* vbi, Prefix pfx, Long delta )
   4883 {
   4884    Int    len;
   4885    UInt   r_src, r_dst;
   4886    HChar  dis_buf[50];
   4887    IRTemp t1, t2;
   4888 
   4889    /* On entry, delta points at the second byte of the insn (the modrm
   4890       byte).*/
   4891    UChar first_opcode = getUChar(delta-1);
   4892    UChar modrm        = getUChar(delta+0);
   4893 
   4894    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   4895 
   4896    if (first_opcode == 0xD8) {
   4897       if (modrm < 0xC0) {
   4898 
   4899          /* bits 5,4,3 are an opcode extension, and the modRM also
   4900            specifies an address. */
   4901          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   4902          delta += len;
   4903 
   4904          switch (gregLO3ofRM(modrm)) {
   4905 
   4906             case 0: /* FADD single-real */
   4907                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   4908                break;
   4909 
   4910             case 1: /* FMUL single-real */
   4911                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   4912                break;
   4913 
   4914 //..             case 2: /* FCOM single-real */
   4915 //..                DIP("fcoms %s\n", dis_buf);
   4916 //..                /* This forces C1 to zero, which isn't right. */
   4917 //..                put_C3210(
   4918 //..                    binop( Iop_And32,
   4919 //..                           binop(Iop_Shl32,
   4920 //..                                 binop(Iop_CmpF64,
   4921 //..                                       get_ST(0),
   4922 //..                                       unop(Iop_F32toF64,
   4923 //..                                            loadLE(Ity_F32,mkexpr(addr)))),
   4924 //..                                 mkU8(8)),
   4925 //..                           mkU32(0x4500)
   4926 //..                    ));
   4927 //..                break;
   4928 //..
   4929 //..             case 3: /* FCOMP single-real */
   4930 //..                DIP("fcomps %s\n", dis_buf);
   4931 //..                /* This forces C1 to zero, which isn't right. */
   4932 //..                put_C3210(
   4933 //..                    binop( Iop_And32,
   4934 //..                           binop(Iop_Shl32,
   4935 //..                                 binop(Iop_CmpF64,
   4936 //..                                       get_ST(0),
   4937 //..                                       unop(Iop_F32toF64,
   4938 //..                                            loadLE(Ity_F32,mkexpr(addr)))),
   4939 //..                                 mkU8(8)),
   4940 //..                           mkU32(0x4500)
   4941 //..                    ));
   4942 //..                fp_pop();
   4943 //..                break;
   4944 
   4945             case 4: /* FSUB single-real */
   4946                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   4947                break;
   4948 
   4949             case 5: /* FSUBR single-real */
   4950                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   4951                break;
   4952 
   4953             case 6: /* FDIV single-real */
   4954                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   4955                break;
   4956 
   4957             case 7: /* FDIVR single-real */
   4958                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   4959                break;
   4960 
   4961             default:
   4962                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   4963                vex_printf("first_opcode == 0xD8\n");
   4964                goto decode_fail;
   4965          }
   4966       } else {
   4967          delta++;
   4968          switch (modrm) {
   4969 
   4970             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   4971                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   4972                break;
   4973 
   4974             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   4975                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   4976                break;
   4977 
   4978             /* Dunno if this is right */
   4979             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   4980                r_dst = (UInt)modrm - 0xD0;
   4981                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   4982                /* This forces C1 to zero, which isn't right. */
   4983                put_C3210(
   4984                    unop(Iop_32Uto64,
   4985                    binop( Iop_And32,
   4986                           binop(Iop_Shl32,
   4987                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   4988                                 mkU8(8)),
   4989                           mkU32(0x4500)
   4990                    )));
   4991                break;
   4992 
   4993             /* Dunno if this is right */
   4994             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   4995                r_dst = (UInt)modrm - 0xD8;
   4996                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   4997                /* This forces C1 to zero, which isn't right. */
   4998                put_C3210(
   4999                    unop(Iop_32Uto64,
   5000                    binop( Iop_And32,
   5001                           binop(Iop_Shl32,
   5002                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5003                                 mkU8(8)),
   5004                           mkU32(0x4500)
   5005                    )));
   5006                fp_pop();
   5007                break;
   5008 
   5009             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5010                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5011                break;
   5012 
   5013             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5014                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5015                break;
   5016 
   5017             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5018                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5019                break;
   5020 
   5021             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5022                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5023                break;
   5024 
   5025             default:
   5026                goto decode_fail;
   5027          }
   5028       }
   5029    }
   5030 
   5031    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5032    else
   5033    if (first_opcode == 0xD9) {
   5034       if (modrm < 0xC0) {
   5035 
   5036          /* bits 5,4,3 are an opcode extension, and the modRM also
   5037             specifies an address. */
   5038          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5039          delta += len;
   5040 
   5041          switch (gregLO3ofRM(modrm)) {
   5042 
   5043             case 0: /* FLD single-real */
   5044                DIP("flds %s\n", dis_buf);
   5045                fp_push();
   5046                put_ST(0, unop(Iop_F32toF64,
   5047                               loadLE(Ity_F32, mkexpr(addr))));
   5048                break;
   5049 
   5050             case 2: /* FST single-real */
   5051                DIP("fsts %s\n", dis_buf);
   5052                storeLE(mkexpr(addr),
   5053                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5054                break;
   5055 
   5056             case 3: /* FSTP single-real */
   5057                DIP("fstps %s\n", dis_buf);
   5058                storeLE(mkexpr(addr),
   5059                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5060                fp_pop();
   5061                break;
   5062 
   5063             case 4: { /* FLDENV m28 */
   5064                /* Uses dirty helper:
   5065                      VexEmWarn amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5066                IRTemp    ew = newTemp(Ity_I32);
   5067                IRTemp   w64 = newTemp(Ity_I64);
   5068                IRDirty*   d = unsafeIRDirty_0_N (
   5069                                  0/*regparms*/,
   5070                                  "amd64g_dirtyhelper_FLDENV",
   5071                                  &amd64g_dirtyhelper_FLDENV,
   5072                                  mkIRExprVec_1( mkexpr(addr) )
   5073                               );
   5074                d->needsBBP = True;
   5075                d->tmp      = w64;
   5076                /* declare we're reading memory */
   5077                d->mFx   = Ifx_Read;
   5078                d->mAddr = mkexpr(addr);
   5079                d->mSize = 28;
   5080 
   5081                /* declare we're writing guest state */
   5082                d->nFxState = 4;
   5083 
   5084                d->fxState[0].fx     = Ifx_Write;
   5085                d->fxState[0].offset = OFFB_FTOP;
   5086                d->fxState[0].size   = sizeof(UInt);
   5087 
   5088                d->fxState[1].fx     = Ifx_Write;
   5089                d->fxState[1].offset = OFFB_FPTAGS;
   5090                d->fxState[1].size   = 8 * sizeof(UChar);
   5091 
   5092                d->fxState[2].fx     = Ifx_Write;
   5093                d->fxState[2].offset = OFFB_FPROUND;
   5094                d->fxState[2].size   = sizeof(ULong);
   5095 
   5096                d->fxState[3].fx     = Ifx_Write;
   5097                d->fxState[3].offset = OFFB_FC3210;
   5098                d->fxState[3].size   = sizeof(ULong);
   5099 
   5100                stmt( IRStmt_Dirty(d) );
   5101 
   5102                /* ew contains any emulation warning we may need to
   5103                   issue.  If needed, side-exit to the next insn,
   5104                   reporting the warning, so that Valgrind's dispatcher
   5105                   sees the warning. */
   5106 	       assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5107                put_emwarn( mkexpr(ew) );
   5108                stmt(
   5109                   IRStmt_Exit(
   5110                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5111                      Ijk_EmWarn,
   5112                      IRConst_U64( guest_RIP_bbstart+delta )
   5113                   )
   5114                );
   5115 
   5116                DIP("fldenv %s\n", dis_buf);
   5117                break;
   5118             }
   5119 
   5120             case 5: {/* FLDCW */
   5121                /* The only thing we observe in the control word is the
   5122                   rounding mode.  Therefore, pass the 16-bit value
   5123                   (x87 native-format control word) to a clean helper,
   5124                   getting back a 64-bit value, the lower half of which
   5125                   is the FPROUND value to store, and the upper half of
   5126                   which is the emulation-warning token which may be
   5127                   generated.
   5128                */
   5129                /* ULong amd64h_check_fldcw ( ULong ); */
   5130                IRTemp t64 = newTemp(Ity_I64);
   5131                IRTemp ew = newTemp(Ity_I32);
   5132                DIP("fldcw %s\n", dis_buf);
   5133                assign( t64, mkIRExprCCall(
   5134                                Ity_I64, 0/*regparms*/,
   5135                                "amd64g_check_fldcw",
   5136                                &amd64g_check_fldcw,
   5137                                mkIRExprVec_1(
   5138                                   unop( Iop_16Uto64,
   5139                                         loadLE(Ity_I16, mkexpr(addr)))
   5140                                )
   5141                             )
   5142                      );
   5143 
   5144                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5145                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5146                put_emwarn( mkexpr(ew) );
   5147                /* Finally, if an emulation warning was reported,
   5148                   side-exit to the next insn, reporting the warning,
   5149                   so that Valgrind's dispatcher sees the warning. */
   5150                stmt(
   5151                   IRStmt_Exit(
   5152                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5153                      Ijk_EmWarn,
   5154                      IRConst_U64( guest_RIP_bbstart+delta )
   5155                   )
   5156                );
   5157                break;
   5158             }
   5159 
   5160             case 6: { /* FNSTENV m28 */
   5161                /* Uses dirty helper:
   5162                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5163                IRDirty* d = unsafeIRDirty_0_N (
   5164                                0/*regparms*/,
   5165                                "amd64g_dirtyhelper_FSTENV",
   5166                                &amd64g_dirtyhelper_FSTENV,
   5167                                mkIRExprVec_1( mkexpr(addr) )
   5168                             );
   5169                d->needsBBP = True;
   5170                /* declare we're writing memory */
   5171                d->mFx   = Ifx_Write;
   5172                d->mAddr = mkexpr(addr);
   5173                d->mSize = 28;
   5174 
   5175                /* declare we're reading guest state */
   5176                d->nFxState = 4;
   5177 
   5178                d->fxState[0].fx     = Ifx_Read;
   5179                d->fxState[0].offset = OFFB_FTOP;
   5180                d->fxState[0].size   = sizeof(UInt);
   5181 
   5182                d->fxState[1].fx     = Ifx_Read;
   5183                d->fxState[1].offset = OFFB_FPTAGS;
   5184                d->fxState[1].size   = 8 * sizeof(UChar);
   5185 
   5186                d->fxState[2].fx     = Ifx_Read;
   5187                d->fxState[2].offset = OFFB_FPROUND;
   5188                d->fxState[2].size   = sizeof(ULong);
   5189 
   5190                d->fxState[3].fx     = Ifx_Read;
   5191                d->fxState[3].offset = OFFB_FC3210;
   5192                d->fxState[3].size   = sizeof(ULong);
   5193 
   5194                stmt( IRStmt_Dirty(d) );
   5195 
   5196                DIP("fnstenv %s\n", dis_buf);
   5197                break;
   5198             }
   5199 
   5200             case 7: /* FNSTCW */
   5201                /* Fake up a native x87 FPU control word.  The only
   5202                   thing it depends on is FPROUND[1:0], so call a clean
   5203                   helper to cook it up. */
   5204                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5205                DIP("fnstcw %s\n", dis_buf);
   5206                storeLE(
   5207                   mkexpr(addr),
   5208                   unop( Iop_64to16,
   5209                         mkIRExprCCall(
   5210                            Ity_I64, 0/*regp*/,
   5211                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5212                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5213                         )
   5214                   )
   5215                );
   5216                break;
   5217 
   5218             default:
   5219                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5220                vex_printf("first_opcode == 0xD9\n");
   5221                goto decode_fail;
   5222          }
   5223 
   5224       } else {
   5225          delta++;
   5226          switch (modrm) {
   5227 
   5228             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5229                r_src = (UInt)modrm - 0xC0;
   5230                DIP("fld %%st(%u)\n", r_src);
   5231                t1 = newTemp(Ity_F64);
   5232                assign(t1, get_ST(r_src));
   5233                fp_push();
   5234                put_ST(0, mkexpr(t1));
   5235                break;
   5236 
   5237             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5238                r_src = (UInt)modrm - 0xC8;
   5239                DIP("fxch %%st(%u)\n", r_src);
   5240                t1 = newTemp(Ity_F64);
   5241                t2 = newTemp(Ity_F64);
   5242                assign(t1, get_ST(0));
   5243                assign(t2, get_ST(r_src));
   5244                put_ST_UNCHECKED(0, mkexpr(t2));
   5245                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5246                break;
   5247 
   5248             case 0xE0: /* FCHS */
   5249                DIP("fchs\n");
   5250                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5251                break;
   5252 
   5253             case 0xE1: /* FABS */
   5254                DIP("fabs\n");
   5255                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5256                break;
   5257 
   5258             case 0xE5: { /* FXAM */
   5259                /* This is an interesting one.  It examines %st(0),
   5260                   regardless of whether the tag says it's empty or not.
   5261                   Here, just pass both the tag (in our format) and the
   5262                   value (as a double, actually a ULong) to a helper
   5263                   function. */
   5264                IRExpr** args
   5265                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5266                                    unop(Iop_ReinterpF64asI64,
   5267                                         get_ST_UNCHECKED(0)) );
   5268                put_C3210(mkIRExprCCall(
   5269                             Ity_I64,
   5270                             0/*regparm*/,
   5271                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5272                             args
   5273                         ));
   5274                DIP("fxam\n");
   5275                break;
   5276             }
   5277 
   5278             case 0xE8: /* FLD1 */
   5279                DIP("fld1\n");
   5280                fp_push();
   5281                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5282                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5283                break;
   5284 
   5285             case 0xE9: /* FLDL2T */
   5286                DIP("fldl2t\n");
   5287                fp_push();
   5288                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5289                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5290                break;
   5291 
   5292             case 0xEA: /* FLDL2E */
   5293                DIP("fldl2e\n");
   5294                fp_push();
   5295                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5296                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5297                break;
   5298 
   5299             case 0xEB: /* FLDPI */
   5300                DIP("fldpi\n");
   5301                fp_push();
   5302                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5303                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5304                break;
   5305 
   5306             case 0xEC: /* FLDLG2 */
   5307                DIP("fldlg2\n");
   5308                fp_push();
   5309                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5310                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5311                break;
   5312 
   5313             case 0xED: /* FLDLN2 */
   5314                DIP("fldln2\n");
   5315                fp_push();
   5316                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5317                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5318                break;
   5319 
   5320             case 0xEE: /* FLDZ */
   5321                DIP("fldz\n");
   5322                fp_push();
   5323                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5324                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5325                break;
   5326 
   5327             case 0xF0: /* F2XM1 */
   5328                DIP("f2xm1\n");
   5329                put_ST_UNCHECKED(0,
   5330                   binop(Iop_2xm1F64,
   5331                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5332                         get_ST(0)));
   5333                break;
   5334 
   5335             case 0xF1: /* FYL2X */
   5336                DIP("fyl2x\n");
   5337                put_ST_UNCHECKED(1,
   5338                   triop(Iop_Yl2xF64,
   5339                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5340                         get_ST(1),
   5341                         get_ST(0)));
   5342                fp_pop();
   5343                break;
   5344 
   5345             case 0xF2: /* FPTAN */
   5346                DIP("ftan\n");
   5347                put_ST_UNCHECKED(0,
   5348                   binop(Iop_TanF64,
   5349                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5350                         get_ST(0)));
   5351                fp_push();
   5352                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   5353                clear_C2(); /* HACK */
   5354                break;
   5355 
   5356             case 0xF3: /* FPATAN */
   5357                DIP("fpatan\n");
   5358                put_ST_UNCHECKED(1,
   5359                   triop(Iop_AtanF64,
   5360                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5361                         get_ST(1),
   5362                         get_ST(0)));
   5363                fp_pop();
   5364                break;
   5365 
   5366             case 0xF4: { /* FXTRACT */
   5367                IRTemp argF = newTemp(Ity_F64);
   5368                IRTemp sigF = newTemp(Ity_F64);
   5369                IRTemp expF = newTemp(Ity_F64);
   5370                IRTemp argI = newTemp(Ity_I64);
   5371                IRTemp sigI = newTemp(Ity_I64);
   5372                IRTemp expI = newTemp(Ity_I64);
   5373                DIP("fxtract\n");
   5374                assign( argF, get_ST(0) );
   5375                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5376                assign( sigI,
   5377                        mkIRExprCCall(
   5378                           Ity_I64, 0/*regparms*/,
   5379                           "x86amd64g_calculate_FXTRACT",
   5380                           &x86amd64g_calculate_FXTRACT,
   5381                           mkIRExprVec_2( mkexpr(argI),
   5382                                          mkIRExpr_HWord(0)/*sig*/ ))
   5383                );
   5384                assign( expI,
   5385                        mkIRExprCCall(
   5386                           Ity_I64, 0/*regparms*/,
   5387                           "x86amd64g_calculate_FXTRACT",
   5388                           &x86amd64g_calculate_FXTRACT,
   5389                           mkIRExprVec_2( mkexpr(argI),
   5390                                          mkIRExpr_HWord(1)/*exp*/ ))
   5391                );
   5392                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5393                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5394                /* exponent */
   5395                put_ST_UNCHECKED(0, mkexpr(expF) );
   5396                fp_push();
   5397                /* significand */
   5398                put_ST(0, mkexpr(sigF) );
   5399                break;
   5400             }
   5401 
   5402             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5403                IRTemp a1 = newTemp(Ity_F64);
   5404                IRTemp a2 = newTemp(Ity_F64);
   5405                DIP("fprem1\n");
   5406                /* Do FPREM1 twice, once to get the remainder, and once
   5407                   to get the C3210 flag values. */
   5408                assign( a1, get_ST(0) );
   5409                assign( a2, get_ST(1) );
   5410                put_ST_UNCHECKED(0,
   5411                   triop(Iop_PRem1F64,
   5412                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5413                         mkexpr(a1),
   5414                         mkexpr(a2)));
   5415                put_C3210(
   5416                   unop(Iop_32Uto64,
   5417                   triop(Iop_PRem1C3210F64,
   5418                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5419                         mkexpr(a1),
   5420                         mkexpr(a2)) ));
   5421                break;
   5422             }
   5423 
   5424             case 0xF7: /* FINCSTP */
   5425                DIP("fincstp\n");
   5426                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5427                break;
   5428 
   5429             case 0xF8: { /* FPREM -- not IEEE compliant */
   5430                IRTemp a1 = newTemp(Ity_F64);
   5431                IRTemp a2 = newTemp(Ity_F64);
   5432                DIP("fprem\n");
   5433                /* Do FPREM twice, once to get the remainder, and once
   5434                   to get the C3210 flag values. */
   5435                assign( a1, get_ST(0) );
   5436                assign( a2, get_ST(1) );
   5437                put_ST_UNCHECKED(0,
   5438                   triop(Iop_PRemF64,
   5439                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5440                         mkexpr(a1),
   5441                         mkexpr(a2)));
   5442                put_C3210(
   5443                   unop(Iop_32Uto64,
   5444                   triop(Iop_PRemC3210F64,
   5445                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5446                         mkexpr(a1),
   5447                         mkexpr(a2)) ));
   5448                break;
   5449             }
   5450 
   5451             case 0xF9: /* FYL2XP1 */
   5452                DIP("fyl2xp1\n");
   5453                put_ST_UNCHECKED(1,
   5454                   triop(Iop_Yl2xp1F64,
   5455                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5456                         get_ST(1),
   5457                         get_ST(0)));
   5458                fp_pop();
   5459                break;
   5460 
   5461             case 0xFA: /* FSQRT */
   5462                DIP("fsqrt\n");
   5463                put_ST_UNCHECKED(0,
   5464                   binop(Iop_SqrtF64,
   5465                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5466                         get_ST(0)));
   5467                break;
   5468 
   5469             case 0xFB: { /* FSINCOS */
   5470                IRTemp a1 = newTemp(Ity_F64);
   5471                assign( a1, get_ST(0) );
   5472                DIP("fsincos\n");
   5473                put_ST_UNCHECKED(0,
   5474                   binop(Iop_SinF64,
   5475                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5476                         mkexpr(a1)));
   5477                fp_push();
   5478                put_ST(0,
   5479                   binop(Iop_CosF64,
   5480                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5481                         mkexpr(a1)));
   5482                clear_C2(); /* HACK */
   5483                break;
   5484             }
   5485 
   5486             case 0xFC: /* FRNDINT */
   5487                DIP("frndint\n");
   5488                put_ST_UNCHECKED(0,
   5489                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   5490                break;
   5491 
   5492             case 0xFD: /* FSCALE */
   5493                DIP("fscale\n");
   5494                put_ST_UNCHECKED(0,
   5495                   triop(Iop_ScaleF64,
   5496                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5497                         get_ST(0),
   5498                         get_ST(1)));
   5499                break;
   5500 
   5501             case 0xFE: /* FSIN */
   5502                DIP("fsin\n");
   5503                put_ST_UNCHECKED(0,
   5504                   binop(Iop_SinF64,
   5505                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5506                         get_ST(0)));
   5507                clear_C2(); /* HACK */
   5508                break;
   5509 
   5510             case 0xFF: /* FCOS */
   5511                DIP("fcos\n");
   5512                put_ST_UNCHECKED(0,
   5513                   binop(Iop_CosF64,
   5514                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5515                         get_ST(0)));
   5516                clear_C2(); /* HACK */
   5517                break;
   5518 
   5519             default:
   5520                goto decode_fail;
   5521          }
   5522       }
   5523    }
   5524 
   5525    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   5526    else
   5527    if (first_opcode == 0xDA) {
   5528 
   5529       if (modrm < 0xC0) {
   5530 
   5531          /* bits 5,4,3 are an opcode extension, and the modRM also
   5532             specifies an address. */
   5533          IROp   fop;
   5534          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5535          delta += len;
   5536          switch (gregLO3ofRM(modrm)) {
   5537 
   5538             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   5539                DIP("fiaddl %s\n", dis_buf);
   5540                fop = Iop_AddF64;
   5541                goto do_fop_m32;
   5542 
   5543             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   5544                DIP("fimull %s\n", dis_buf);
   5545                fop = Iop_MulF64;
   5546                goto do_fop_m32;
   5547 
   5548             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   5549                DIP("fisubl %s\n", dis_buf);
   5550                fop = Iop_SubF64;
   5551                goto do_fop_m32;
   5552 
   5553             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   5554                DIP("fisubrl %s\n", dis_buf);
   5555                fop = Iop_SubF64;
   5556                goto do_foprev_m32;
   5557 
   5558             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   5559                DIP("fisubl %s\n", dis_buf);
   5560                fop = Iop_DivF64;
   5561                goto do_fop_m32;
   5562 
   5563             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   5564                DIP("fidivrl %s\n", dis_buf);
   5565                fop = Iop_DivF64;
   5566                goto do_foprev_m32;
   5567 
   5568             do_fop_m32:
   5569                put_ST_UNCHECKED(0,
   5570                   triop(fop,
   5571                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5572                         get_ST(0),
   5573                         unop(Iop_I32StoF64,
   5574                              loadLE(Ity_I32, mkexpr(addr)))));
   5575                break;
   5576 
   5577             do_foprev_m32:
   5578                put_ST_UNCHECKED(0,
   5579                   triop(fop,
   5580                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5581                         unop(Iop_I32StoF64,
   5582                              loadLE(Ity_I32, mkexpr(addr))),
   5583                         get_ST(0)));
   5584                break;
   5585 
   5586             default:
   5587                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5588                vex_printf("first_opcode == 0xDA\n");
   5589                goto decode_fail;
   5590          }
   5591 
   5592       } else {
   5593 
   5594          delta++;
   5595          switch (modrm) {
   5596 
   5597             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   5598                r_src = (UInt)modrm - 0xC0;
   5599                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   5600                put_ST_UNCHECKED(0,
   5601                                 IRExpr_Mux0X(
   5602                                     unop(Iop_1Uto8,
   5603                                          mk_amd64g_calculate_condition(AMD64CondB)),
   5604                                     get_ST(0), get_ST(r_src)) );
   5605                break;
   5606 
   5607             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   5608                r_src = (UInt)modrm - 0xC8;
   5609                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   5610                put_ST_UNCHECKED(0,
   5611                                 IRExpr_Mux0X(
   5612                                     unop(Iop_1Uto8,
   5613                                          mk_amd64g_calculate_condition(AMD64CondZ)),
   5614                                     get_ST(0), get_ST(r_src)) );
   5615                break;
   5616 
   5617             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   5618                r_src = (UInt)modrm - 0xD0;
   5619                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   5620                put_ST_UNCHECKED(0,
   5621                                 IRExpr_Mux0X(
   5622                                     unop(Iop_1Uto8,
   5623                                          mk_amd64g_calculate_condition(AMD64CondBE)),
   5624                                     get_ST(0), get_ST(r_src)) );
   5625                break;
   5626 
   5627             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   5628                r_src = (UInt)modrm - 0xD8;
   5629                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   5630                put_ST_UNCHECKED(0,
   5631                                 IRExpr_Mux0X(
   5632                                     unop(Iop_1Uto8,
   5633                                          mk_amd64g_calculate_condition(AMD64CondP)),
   5634                                     get_ST(0), get_ST(r_src)) );
   5635                break;
   5636 
            case 0xE9: /* FUCOMPP %st(0),%st(1) */
               DIP("fucompp %%st(0),%%st(1)\n");
               /* This forces C1 to zero, which isn't right. */
               /* Iop_CmpF64 yields an IRCmpF64Result; shifting it left
                  by 8 and masking with 0x4500 keeps only the bits that
                  fall in the C3 (bit 14), C2 (bit 10) and C0 (bit 8)
                  positions of the x87 status word, which put_C3210 then
                  records (widened to 64 bits). */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               /* FUCOMPP pops the register stack twice. */
               fp_pop();
               fp_pop();
               break;
   5651 
   5652             default:
   5653                goto decode_fail;
   5654          }
   5655 
   5656       }
   5657    }
   5658 
   5659    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   5660    else
   5661    if (first_opcode == 0xDB) {
   5662       if (modrm < 0xC0) {
   5663 
   5664          /* bits 5,4,3 are an opcode extension, and the modRM also
   5665             specifies an address. */
   5666          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5667          delta += len;
   5668 
   5669          switch (gregLO3ofRM(modrm)) {
   5670 
   5671             case 0: /* FILD m32int */
   5672                DIP("fildl %s\n", dis_buf);
   5673                fp_push();
   5674                put_ST(0, unop(Iop_I32StoF64,
   5675                               loadLE(Ity_I32, mkexpr(addr))));
   5676                break;
   5677 
   5678             case 1: /* FISTTPL m32 (SSE3) */
   5679                DIP("fisttpl %s\n", dis_buf);
   5680                storeLE( mkexpr(addr),
   5681                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   5682                fp_pop();
   5683                break;
   5684 
   5685             case 2: /* FIST m32 */
   5686                DIP("fistl %s\n", dis_buf);
   5687                storeLE( mkexpr(addr),
   5688                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5689                break;
   5690 
   5691             case 3: /* FISTP m32 */
   5692                DIP("fistpl %s\n", dis_buf);
   5693                storeLE( mkexpr(addr),
   5694                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5695                fp_pop();
   5696                break;
   5697 
   5698             case 5: { /* FLD extended-real */
   5699                /* Uses dirty helper:
   5700                      ULong amd64g_loadF80le ( ULong )
   5701                   addr holds the address.  First, do a dirty call to
   5702                   get hold of the data. */
   5703                IRTemp   val  = newTemp(Ity_I64);
   5704                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   5705 
   5706                IRDirty* d = unsafeIRDirty_1_N (
   5707                                val,
   5708                                0/*regparms*/,
   5709                                "amd64g_dirtyhelper_loadF80le",
   5710                                &amd64g_dirtyhelper_loadF80le,
   5711                                args
   5712                             );
   5713                /* declare that we're reading memory */
   5714                d->mFx   = Ifx_Read;
   5715                d->mAddr = mkexpr(addr);
   5716                d->mSize = 10;
   5717 
   5718                /* execute the dirty call, dumping the result in val. */
   5719                stmt( IRStmt_Dirty(d) );
   5720                fp_push();
   5721                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   5722 
   5723                DIP("fldt %s\n", dis_buf);
   5724                break;
   5725             }
   5726 
   5727             case 7: { /* FSTP extended-real */
   5728                /* Uses dirty helper:
   5729                      void amd64g_storeF80le ( ULong addr, ULong data )
   5730                */
   5731                IRExpr** args
   5732                   = mkIRExprVec_2( mkexpr(addr),
   5733                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   5734 
   5735                IRDirty* d = unsafeIRDirty_0_N (
   5736                                0/*regparms*/,
   5737                                "amd64g_dirtyhelper_storeF80le",
   5738                                &amd64g_dirtyhelper_storeF80le,
   5739                                args
   5740                             );
   5741                /* declare we're writing memory */
   5742                d->mFx   = Ifx_Write;
   5743                d->mAddr = mkexpr(addr);
   5744                d->mSize = 10;
   5745 
   5746                /* execute the dirty call. */
   5747                stmt( IRStmt_Dirty(d) );
   5748                fp_pop();
   5749 
   5750                DIP("fstpt\n %s", dis_buf);
   5751                break;
   5752             }
   5753 
   5754             default:
   5755                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5756                vex_printf("first_opcode == 0xDB\n");
   5757                goto decode_fail;
   5758          }
   5759 
   5760       } else {
   5761 
   5762          delta++;
   5763          switch (modrm) {
   5764 
   5765             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   5766                r_src = (UInt)modrm - 0xC0;
   5767                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   5768                put_ST_UNCHECKED(0,
   5769                                 IRExpr_Mux0X(
   5770                                     unop(Iop_1Uto8,
   5771                                          mk_amd64g_calculate_condition(AMD64CondNB)),
   5772                                     get_ST(0), get_ST(r_src)) );
   5773                break;
   5774 
   5775             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   5776                r_src = (UInt)modrm - 0xC8;
   5777                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   5778                put_ST_UNCHECKED(
   5779                   0,
   5780                   IRExpr_Mux0X(
   5781                      unop(Iop_1Uto8,
   5782                           mk_amd64g_calculate_condition(AMD64CondNZ)),
   5783                      get_ST(0),
   5784                      get_ST(r_src)
   5785                   )
   5786                );
   5787                break;
   5788 
   5789             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   5790                r_src = (UInt)modrm - 0xD0;
   5791                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   5792                put_ST_UNCHECKED(
   5793                   0,
   5794                   IRExpr_Mux0X(
   5795                      unop(Iop_1Uto8,
   5796                           mk_amd64g_calculate_condition(AMD64CondNBE)),
   5797                      get_ST(0),
   5798                      get_ST(r_src)
   5799                   )
   5800                );
   5801                break;
   5802 
   5803             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   5804                r_src = (UInt)modrm - 0xD8;
   5805                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   5806                put_ST_UNCHECKED(
   5807                   0,
   5808                   IRExpr_Mux0X(
   5809                      unop(Iop_1Uto8,
   5810                           mk_amd64g_calculate_condition(AMD64CondNP)),
   5811                      get_ST(0),
   5812                      get_ST(r_src)
   5813                   )
   5814                );
   5815                break;
   5816 
            case 0xE2:
               /* FNCLEX clears the FPU exception flags.  Exception
                  state is not modelled here, so apart from the
                  disassembly trace this is deliberately a no-op. */
               DIP("fnclex\n");
               break;
   5820 
   5821             case 0xE3: {
   5822                /* Uses dirty helper:
   5823                      void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   5824                IRDirty* d  = unsafeIRDirty_0_N (
   5825                                 0/*regparms*/,
   5826                                 "amd64g_dirtyhelper_FINIT",
   5827                                 &amd64g_dirtyhelper_FINIT,
   5828                                 mkIRExprVec_0()
   5829                              );
   5830                d->needsBBP = True;
   5831 
   5832                /* declare we're writing guest state */
   5833                d->nFxState = 5;
   5834 
   5835                d->fxState[0].fx     = Ifx_Write;
   5836                d->fxState[0].offset = OFFB_FTOP;
   5837                d->fxState[0].size   = sizeof(UInt);
   5838 
   5839                d->fxState[1].fx     = Ifx_Write;
   5840                d->fxState[1].offset = OFFB_FPREGS;
   5841                d->fxState[1].size   = 8 * sizeof(ULong);
   5842 
   5843                d->fxState[2].fx     = Ifx_Write;
   5844                d->fxState[2].offset = OFFB_FPTAGS;
   5845                d->fxState[2].size   = 8 * sizeof(UChar);
   5846 
   5847                d->fxState[3].fx     = Ifx_Write;
   5848                d->fxState[3].offset = OFFB_FPROUND;
   5849                d->fxState[3].size   = sizeof(ULong);
   5850 
   5851                d->fxState[4].fx     = Ifx_Write;
   5852                d->fxState[4].offset = OFFB_FC3210;
   5853                d->fxState[4].size   = sizeof(ULong);
   5854 
   5855                stmt( IRStmt_Dirty(d) );
   5856 
   5857                DIP("fninit\n");
   5858                break;
   5859             }
   5860 
   5861             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   5862                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   5863                break;
   5864 
   5865             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   5866                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   5867                break;
   5868 
   5869             default:
   5870                goto decode_fail;
   5871          }
   5872       }
   5873    }
   5874 
   5875    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         /* 0xDC memory forms: ST(0) op= m64 double-real.  The trailing
            'True' passed to the fp_do_*_mem_ST_0 helpers presumably
            selects the 64-bit (double-real) load, as opposed to the
            32-bit forms under 0xD8 -- confirm against the helper. */
         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

//..             case 2: /* FCOM double-real */
//..                DIP("fcoml %s\n", dis_buf);
//..                /* This forces C1 to zero, which isn't right. */
//..                put_C3210(
//..                    binop( Iop_And32,
//..                           binop(Iop_Shl32,
//..                                 binop(Iop_CmpF64,
//..                                       get_ST(0),
//..                                       loadLE(Ity_F64,mkexpr(addr))),
//..                                 mkU8(8)),
//..                           mkU32(0x4500)
//..                    ));
//..                break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Shift-by-8 + mask-0x4500 maps the Iop_CmpF64 result
                  into the C3/C2/C0 bit positions of the status word;
                  same trick as the FUCOM* cases. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         delta++;
         /* 0xDC register forms: ST(i) op= ST(0), i.e. the destination
            is ST(i), selected by the low three modrm bits.  The final
            'False' presumably means "do not pop" (the popping variants
            are encoded under 0xDE) -- confirm against fp_do_op_ST_ST. */
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
   5982 
   5983    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   5984    else
   5985    if (first_opcode == 0xDD) {
   5986 
   5987       if (modrm < 0xC0) {
   5988 
   5989          /* bits 5,4,3 are an opcode extension, and the modRM also
   5990             specifies an address. */
   5991          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5992          delta += len;
   5993 
   5994          switch (gregLO3ofRM(modrm)) {
   5995 
   5996             case 0: /* FLD double-real */
   5997                DIP("fldl %s\n", dis_buf);
   5998                fp_push();
   5999                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6000                break;
   6001 
   6002             case 1: /* FISTTPQ m64 (SSE3) */
   6003                DIP("fistppll %s\n", dis_buf);
   6004                storeLE( mkexpr(addr),
   6005                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6006                fp_pop();
   6007                break;
   6008 
   6009             case 2: /* FST double-real */
   6010                DIP("fstl %s\n", dis_buf);
   6011                storeLE(mkexpr(addr), get_ST(0));
   6012                break;
   6013 
   6014             case 3: /* FSTP double-real */
   6015                DIP("fstpl %s\n", dis_buf);
   6016                storeLE(mkexpr(addr), get_ST(0));
   6017                fp_pop();
   6018                break;
   6019 
   6020 //..             case 4: { /* FRSTOR m108 */
   6021 //..                /* Uses dirty helper:
   6022 //..                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
   6023 //..                IRTemp   ew = newTemp(Ity_I32);
   6024 //..                IRDirty* d  = unsafeIRDirty_0_N (
   6025 //..                                 0/*regparms*/,
   6026 //..                                 "x86g_dirtyhelper_FRSTOR",
   6027 //..                                 &x86g_dirtyhelper_FRSTOR,
   6028 //..                                 mkIRExprVec_1( mkexpr(addr) )
   6029 //..                              );
   6030 //..                d->needsBBP = True;
   6031 //..                d->tmp      = ew;
   6032 //..                /* declare we're reading memory */
   6033 //..                d->mFx   = Ifx_Read;
   6034 //..                d->mAddr = mkexpr(addr);
   6035 //..                d->mSize = 108;
   6036 //..
   6037 //..                /* declare we're writing guest state */
   6038 //..                d->nFxState = 5;
   6039 //..
   6040 //..                d->fxState[0].fx     = Ifx_Write;
   6041 //..                d->fxState[0].offset = OFFB_FTOP;
   6042 //..                d->fxState[0].size   = sizeof(UInt);
   6043 //..
   6044 //..                d->fxState[1].fx     = Ifx_Write;
   6045 //..                d->fxState[1].offset = OFFB_FPREGS;
   6046 //..                d->fxState[1].size   = 8 * sizeof(ULong);
   6047 //..
   6048 //..                d->fxState[2].fx     = Ifx_Write;
   6049 //..                d->fxState[2].offset = OFFB_FPTAGS;
   6050 //..                d->fxState[2].size   = 8 * sizeof(UChar);
   6051 //..
   6052 //..                d->fxState[3].fx     = Ifx_Write;
   6053 //..                d->fxState[3].offset = OFFB_FPROUND;
   6054 //..                d->fxState[3].size   = sizeof(UInt);
   6055 //..
   6056 //..                d->fxState[4].fx     = Ifx_Write;
   6057 //..                d->fxState[4].offset = OFFB_FC3210;
   6058 //..                d->fxState[4].size   = sizeof(UInt);
   6059 //..
   6060 //..                stmt( IRStmt_Dirty(d) );
   6061 //..
   6062 //..                /* ew contains any emulation warning we may need to
   6063 //..                   issue.  If needed, side-exit to the next insn,
   6064 //..                   reporting the warning, so that Valgrind's dispatcher
   6065 //..                   sees the warning. */
   6066 //..                put_emwarn( mkexpr(ew) );
   6067 //..                stmt(
   6068 //..                   IRStmt_Exit(
   6069 //..                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6070 //..                      Ijk_EmWarn,
   6071 //..                      IRConst_U32( ((Addr32)guest_eip_bbstart)+delta)
   6072 //..                   )
   6073 //..                );
   6074 //..
   6075 //..                DIP("frstor %s\n", dis_buf);
   6076 //..                break;
   6077 //..             }
   6078 //..
   6079 //..             case 6: { /* FNSAVE m108 */
   6080 //..                /* Uses dirty helper:
   6081 //..                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
   6082 //..                IRDirty* d = unsafeIRDirty_0_N (
   6083 //..                                0/*regparms*/,
   6084 //..                                "x86g_dirtyhelper_FSAVE",
   6085 //..                                &x86g_dirtyhelper_FSAVE,
   6086 //..                                mkIRExprVec_1( mkexpr(addr) )
   6087 //..                             );
   6088 //..                d->needsBBP = True;
   6089 //..                /* declare we're writing memory */
   6090 //..                d->mFx   = Ifx_Write;
   6091 //..                d->mAddr = mkexpr(addr);
   6092 //..                d->mSize = 108;
   6093 //..
   6094 //..                /* declare we're reading guest state */
   6095 //..                d->nFxState = 5;
   6096 //..
   6097 //..                d->fxState[0].fx     = Ifx_Read;
   6098 //..                d->fxState[0].offset = OFFB_FTOP;
   6099 //..                d->fxState[0].size   = sizeof(UInt);
   6100 //..
   6101 //..                d->fxState[1].fx     = Ifx_Read;
   6102 //..                d->fxState[1].offset = OFFB_FPREGS;
   6103 //..                d->fxState[1].size   = 8 * sizeof(ULong);
   6104 //..
   6105 //..                d->fxState[2].fx     = Ifx_Read;
   6106 //..                d->fxState[2].offset = OFFB_FPTAGS;
   6107 //..                d->fxState[2].size   = 8 * sizeof(UChar);
   6108 //..
   6109 //..                d->fxState[3].fx     = Ifx_Read;
   6110 //..                d->fxState[3].offset = OFFB_FPROUND;
   6111 //..                d->fxState[3].size   = sizeof(UInt);
   6112 //..
   6113 //..                d->fxState[4].fx     = Ifx_Read;
   6114 //..                d->fxState[4].offset = OFFB_FC3210;
   6115 //..                d->fxState[4].size   = sizeof(UInt);
   6116 //..
   6117 //..                stmt( IRStmt_Dirty(d) );
   6118 //..
   6119 //..                DIP("fnsave %s\n", dis_buf);
   6120 //..                break;
   6121 //..             }
   6122 
            case 7: { /* FNSTSW m16 */
               /* Read the guest FPU status word (asserted to be a
                  16-bit IR expression) and store it little-endian at
                  the effective address. */
               IRExpr* sw = get_FPU_sw();
               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
               storeLE( mkexpr(addr), sw );
               DIP("fnstsw %s\n", dis_buf);
               break;
            }
   6130 
   6131             default:
   6132                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6133                vex_printf("first_opcode == 0xDD\n");
   6134                goto decode_fail;
   6135          }
   6136       } else {
   6137          delta++;
   6138          switch (modrm) {
   6139 
   6140             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6141                r_dst = (UInt)modrm - 0xC0;
   6142                DIP("ffree %%st(%u)\n", r_dst);
   6143                put_ST_TAG ( r_dst, mkU8(0) );
   6144                break;
   6145 
   6146             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6147                r_dst = (UInt)modrm - 0xD0;
   6148                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6149                /* P4 manual says: "If the destination operand is a
   6150                   non-empty register, the invalid-operation exception
   6151                   is not generated.  Hence put_ST_UNCHECKED. */
   6152                put_ST_UNCHECKED(r_dst, get_ST(0));
   6153                break;
   6154 
   6155             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6156                r_dst = (UInt)modrm - 0xD8;
   6157                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6158                /* P4 manual says: "If the destination operand is a
   6159                   non-empty register, the invalid-operation exception
   6160                   is not generated.  Hence put_ST_UNCHECKED. */
   6161                put_ST_UNCHECKED(r_dst, get_ST(0));
   6162                fp_pop();
   6163                break;
   6164 
   6165             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6166                r_dst = (UInt)modrm - 0xE0;
   6167                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6168                /* This forces C1 to zero, which isn't right. */
   6169                put_C3210(
   6170                    unop(Iop_32Uto64,
   6171                    binop( Iop_And32,
   6172                           binop(Iop_Shl32,
   6173                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6174                                 mkU8(8)),
   6175                           mkU32(0x4500)
   6176                    )));
   6177                break;
   6178 
   6179             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6180                r_dst = (UInt)modrm - 0xE8;
   6181                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6182                /* This forces C1 to zero, which isn't right. */
   6183                put_C3210(
   6184                    unop(Iop_32Uto64,
   6185                    binop( Iop_And32,
   6186                           binop(Iop_Shl32,
   6187                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6188                                 mkU8(8)),
   6189                           mkU32(0x4500)
   6190                    )));
   6191                fp_pop();
   6192                break;
   6193 
   6194             default:
   6195                goto decode_fail;
   6196          }
   6197       }
   6198    }
   6199 
   6200    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6201    else
   6202    if (first_opcode == 0xDE) {
   6203 
   6204       if (modrm < 0xC0) {
   6205 
   6206          /* bits 5,4,3 are an opcode extension, and the modRM also
   6207             specifies an address. */
   6208          IROp   fop;
   6209          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6210          delta += len;
   6211 
   6212          switch (gregLO3ofRM(modrm)) {
   6213 
   6214             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6215                DIP("fiaddw %s\n", dis_buf);
   6216                fop = Iop_AddF64;
   6217                goto do_fop_m16;
   6218 
   6219             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6220                DIP("fimulw %s\n", dis_buf);
   6221                fop = Iop_MulF64;
   6222                goto do_fop_m16;
   6223 
   6224             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6225                DIP("fisubw %s\n", dis_buf);
   6226                fop = Iop_SubF64;
   6227                goto do_fop_m16;
   6228 
   6229             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6230                DIP("fisubrw %s\n", dis_buf);
   6231                fop = Iop_SubF64;
   6232                goto do_foprev_m16;
   6233 
   6234             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6235                DIP("fisubw %s\n", dis_buf);
   6236                fop = Iop_DivF64;
   6237                goto do_fop_m16;
   6238 
   6239             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6240                DIP("fidivrw %s\n", dis_buf);
   6241                fop = Iop_DivF64;
   6242                goto do_foprev_m16;
   6243 
   6244             do_fop_m16:
   6245                put_ST_UNCHECKED(0,
   6246                   triop(fop,
   6247                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6248                         get_ST(0),
   6249                         unop(Iop_I32StoF64,
   6250                              unop(Iop_16Sto32,
   6251                                   loadLE(Ity_I16, mkexpr(addr))))));
   6252                break;
   6253 
   6254             do_foprev_m16:
   6255                put_ST_UNCHECKED(0,
   6256                   triop(fop,
   6257                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6258                         unop(Iop_I32StoF64,
   6259                              unop(Iop_16Sto32,
   6260                                   loadLE(Ity_I16, mkexpr(addr)))),
   6261                         get_ST(0)));
   6262                break;
   6263 
   6264             default:
   6265                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6266                vex_printf("first_opcode == 0xDE\n");
   6267                goto decode_fail;
   6268          }
   6269 
   6270       } else {
   6271 
   6272          delta++;
   6273          switch (modrm) {
   6274 
   6275             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6276                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6277                break;
   6278 
   6279             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6280                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6281                break;
   6282 
   6283             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6284                DIP("fcompp %%st(0),%%st(1)\n");
   6285                /* This forces C1 to zero, which isn't right. */
   6286                put_C3210(
   6287                    unop(Iop_32Uto64,
   6288                    binop( Iop_And32,
   6289                           binop(Iop_Shl32,
   6290                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6291                                 mkU8(8)),
   6292                           mkU32(0x4500)
   6293                    )));
   6294                fp_pop();
   6295                fp_pop();
   6296                break;
   6297 
   6298             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6299                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6300                break;
   6301 
   6302             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6303                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6304                break;
   6305 
   6306             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6307                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6308                break;
   6309 
   6310             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6311                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6312                break;
   6313 
   6314             default:
   6315                goto decode_fail;
   6316          }
   6317 
   6318       }
   6319    }
   6320 
   6321    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6322    else
   6323    if (first_opcode == 0xDF) {
   6324 
   6325       if (modrm < 0xC0) {
   6326 
   6327          /* bits 5,4,3 are an opcode extension, and the modRM also
   6328             specifies an address. */
   6329          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6330          delta += len;
   6331 
   6332          switch (gregLO3ofRM(modrm)) {
   6333 
   6334             case 0: /* FILD m16int */
   6335                DIP("fildw %s\n", dis_buf);
   6336                fp_push();
   6337                put_ST(0, unop(Iop_I32StoF64,
   6338                               unop(Iop_16Sto32,
   6339                                    loadLE(Ity_I16, mkexpr(addr)))));
   6340                break;
   6341 
   6342             case 1: /* FISTTPS m16 (SSE3) */
   6343                DIP("fisttps %s\n", dis_buf);
   6344                storeLE( mkexpr(addr),
   6345                         x87ishly_qnarrow_32_to_16(
   6346                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6347                fp_pop();
   6348                break;
   6349 
   6350             case 2: /* FIST m16 */
   6351                DIP("fists %s\n", dis_buf);
   6352                storeLE( mkexpr(addr),
   6353                         x87ishly_qnarrow_32_to_16(
   6354                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6355                break;
   6356 
   6357             case 3: /* FISTP m16 */
   6358                DIP("fistps %s\n", dis_buf);
   6359                storeLE( mkexpr(addr),
   6360                         x87ishly_qnarrow_32_to_16(
   6361                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6362                fp_pop();
   6363                break;
   6364 
   6365             case 5: /* FILD m64 */
   6366                DIP("fildll %s\n", dis_buf);
   6367                fp_push();
   6368                put_ST(0, binop(Iop_I64StoF64,
   6369                                get_roundingmode(),
   6370                                loadLE(Ity_I64, mkexpr(addr))));
   6371                break;
   6372 
   6373             case 7: /* FISTP m64 */
   6374                DIP("fistpll %s\n", dis_buf);
   6375                storeLE( mkexpr(addr),
   6376                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6377                fp_pop();
   6378                break;
   6379 
   6380             default:
   6381                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6382                vex_printf("first_opcode == 0xDF\n");
   6383                goto decode_fail;
   6384          }
   6385 
   6386       } else {
   6387 
   6388          delta++;
   6389          switch (modrm) {
   6390 
   6391             case 0xC0: /* FFREEP %st(0) */
   6392                DIP("ffreep %%st(%d)\n", 0);
   6393                put_ST_TAG ( 0, mkU8(0) );
   6394                fp_pop();
   6395                break;
   6396 
   6397             case 0xE0: /* FNSTSW %ax */
   6398                DIP("fnstsw %%ax\n");
   6399                /* Invent a plausible-looking FPU status word value and
   6400                   dump it in %AX:
   6401                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6402                */
   6403                putIRegRAX(
   6404                   2,
   6405                   unop(Iop_32to16,
   6406                        binop(Iop_Or32,
   6407                              binop(Iop_Shl32,
   6408                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6409                                    mkU8(11)),
   6410                              binop(Iop_And32,
   6411                                    unop(Iop_64to32, get_C3210()),
   6412                                    mkU32(0x4700))
   6413                )));
   6414                break;
   6415 
   6416             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6417                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6418                break;
   6419 
   6420             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6421                /* not really right since COMIP != UCOMIP */
   6422                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6423                break;
   6424 
   6425             default:
   6426                goto decode_fail;
   6427          }
   6428       }
   6429 
   6430    }
   6431 
   6432    else
   6433       goto decode_fail;
   6434 
   6435    *decode_ok = True;
   6436    return delta;
   6437 
   6438   decode_fail:
   6439    *decode_ok = False;
   6440    return delta;
   6441 }
   6442 
   6443 
   6444 /*------------------------------------------------------------*/
   6445 /*---                                                      ---*/
   6446 /*--- MMX INSTRUCTIONS                                     ---*/
   6447 /*---                                                      ---*/
   6448 /*------------------------------------------------------------*/
   6449 
   6450 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   6451    IA32 arch manual, volume 3):
   6452 
   6453    Read from, or write to MMX register (viz, any insn except EMMS):
   6454    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   6455    * FP stack pointer set to zero
   6456 
   6457    EMMS:
   6458    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   6459    * FP stack pointer set to zero
   6460 */
   6461 
   6462 static void do_MMX_preamble ( void )
   6463 {
   6464    Int         i;
   6465    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6466    IRExpr*     zero  = mkU32(0);
   6467    IRExpr*     tag1  = mkU8(1);
   6468    put_ftop(zero);
   6469    for (i = 0; i < 8; i++)
   6470       stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
   6471 }
   6472 
   6473 static void do_EMMS_preamble ( void )
   6474 {
   6475    Int         i;
   6476    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6477    IRExpr*     zero  = mkU32(0);
   6478    IRExpr*     tag0  = mkU8(0);
   6479    put_ftop(zero);
   6480    for (i = 0; i < 8; i++)
   6481       stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
   6482 }
   6483 
   6484 
   6485 static IRExpr* getMMXReg ( UInt archreg )
   6486 {
   6487    vassert(archreg < 8);
   6488    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   6489 }
   6490 
   6491 
   6492 static void putMMXReg ( UInt archreg, IRExpr* e )
   6493 {
   6494    vassert(archreg < 8);
   6495    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   6496    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   6497 }
   6498 
   6499 
   6500 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   6501    sense that it does not first call do_MMX_preamble() -- that is the
   6502    responsibility of its caller. */
   6503 
static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,    /* second opcode byte; selects the op */
                                HChar*      name,   /* mnemonic stem for disassembly printing */
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;   /* the G (register) operand */
   IRExpr* argE  = NULL;   /* the E (register-or-memory) operand */
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;          /* if True, complement G before use (PANDN) */
   IROp    op    = Iop_INVALID;    /* IR op, when the insn maps to a single IROp */
   void*   hAddr = NULL;           /* helper address, when a C helper is needed */
   HChar*  hName = NULL;           /* helper name, for the IR call */
   Bool    eLeft = False;          /* if True, E is the left arg (non-commutative ops) */

   /* Record a C helper to compute the result instead of an IROp. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack ops take E as the left (high-half) argument. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;   /* PANDN: (~G) & E */
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: either another MMX register or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Put the operands in the order the IR op expects. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      /* No direct IR op; call the recorded C helper instead. */
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   6650 
   6651 
   6652 /* Vector by scalar shift of G by the amount specified at the bottom
   6653    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   6654 
   6655 static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
   6656                                   Prefix pfx, Long delta,
   6657                                   HChar* opname, IROp op )
   6658 {
   6659    HChar   dis_buf[50];
   6660    Int     alen, size;
   6661    IRTemp  addr;
   6662    Bool    shl, shr, sar;
   6663    UChar   rm   = getUChar(delta);
   6664    IRTemp  g0   = newTemp(Ity_I64);
   6665    IRTemp  g1   = newTemp(Ity_I64);
   6666    IRTemp  amt  = newTemp(Ity_I64);
   6667    IRTemp  amt8 = newTemp(Ity_I8);
   6668 
   6669    if (epartIsReg(rm)) {
   6670       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   6671       DIP("%s %s,%s\n", opname,
   6672                         nameMMXReg(eregLO3ofRM(rm)),
   6673                         nameMMXReg(gregLO3ofRM(rm)) );
   6674       delta++;
   6675    } else {
   6676       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   6677       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   6678       DIP("%s %s,%s\n", opname,
   6679                         dis_buf,
   6680                         nameMMXReg(gregLO3ofRM(rm)) );
   6681       delta += alen;
   6682    }
   6683    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   6684    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   6685 
   6686    shl = shr = sar = False;
   6687    size = 0;
   6688    switch (op) {
   6689       case Iop_ShlN16x4: shl = True; size = 32; break;
   6690       case Iop_ShlN32x2: shl = True; size = 32; break;
   6691       case Iop_Shl64:    shl = True; size = 64; break;
   6692       case Iop_ShrN16x4: shr = True; size = 16; break;
   6693       case Iop_ShrN32x2: shr = True; size = 32; break;
   6694       case Iop_Shr64:    shr = True; size = 64; break;
   6695       case Iop_SarN16x4: sar = True; size = 16; break;
   6696       case Iop_SarN32x2: sar = True; size = 32; break;
   6697       default: vassert(0);
   6698    }
   6699 
   6700    if (shl || shr) {
   6701      assign(
   6702         g1,
   6703         IRExpr_Mux0X(
   6704            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6705            mkU64(0),
   6706            binop(op, mkexpr(g0), mkexpr(amt8))
   6707         )
   6708      );
   6709    } else
   6710    if (sar) {
   6711      assign(
   6712         g1,
   6713         IRExpr_Mux0X(
   6714            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6715            binop(op, mkexpr(g0), mkU8(size-1)),
   6716            binop(op, mkexpr(g0), mkexpr(amt8))
   6717         )
   6718      );
   6719    } else {
   6720       vassert(0);
   6721    }
   6722 
   6723    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   6724    return delta;
   6725 }
   6726 
   6727 
   6728 /* Vector by scalar shift of E by an immediate byte.  This is a
   6729    straight copy of dis_SSE_shiftE_imm. */
   6730 
   6731 static
   6732 ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
   6733 {
   6734    Bool    shl, shr, sar;
   6735    UChar   rm   = getUChar(delta);
   6736    IRTemp  e0   = newTemp(Ity_I64);
   6737    IRTemp  e1   = newTemp(Ity_I64);
   6738    UChar   amt, size;
   6739    vassert(epartIsReg(rm));
   6740    vassert(gregLO3ofRM(rm) == 2
   6741            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   6742    amt = getUChar(delta+1);
   6743    delta += 2;
   6744    DIP("%s $%d,%s\n", opname,
   6745                       (Int)amt,
   6746                       nameMMXReg(eregLO3ofRM(rm)) );
   6747 
   6748    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   6749 
   6750    shl = shr = sar = False;
   6751    size = 0;
   6752    switch (op) {
   6753       case Iop_ShlN16x4: shl = True; size = 16; break;
   6754       case Iop_ShlN32x2: shl = True; size = 32; break;
   6755       case Iop_Shl64:    shl = True; size = 64; break;
   6756       case Iop_SarN16x4: sar = True; size = 16; break;
   6757       case Iop_SarN32x2: sar = True; size = 32; break;
   6758       case Iop_ShrN16x4: shr = True; size = 16; break;
   6759       case Iop_ShrN32x2: shr = True; size = 32; break;
   6760       case Iop_Shr64:    shr = True; size = 64; break;
   6761       default: vassert(0);
   6762    }
   6763 
   6764    if (shl || shr) {
   6765      assign( e1, amt >= size
   6766                     ? mkU64(0)
   6767                     : binop(op, mkexpr(e0), mkU8(amt))
   6768      );
   6769    } else
   6770    if (sar) {
   6771      assign( e1, amt >= size
   6772                     ? binop(op, mkexpr(e0), mkU8(size-1))
   6773                     : binop(op, mkexpr(e0), mkU8(amt))
   6774      );
   6775    } else {
   6776       vassert(0);
   6777    }
   6778 
   6779    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   6780    return delta;
   6781 }
   6782 
   6783 
   6784 /* Completely handle all MMX instructions except emms. */
   6785 
   6786 static
   6787 ULong dis_MMX ( Bool* decode_ok,
   6788                 VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   6789 {
   6790    Int   len;
   6791    UChar modrm;
   6792    HChar dis_buf[50];
   6793    UChar opc = getUChar(delta);
   6794    delta++;
   6795 
   6796    /* dis_MMX handles all insns except emms. */
   6797    do_MMX_preamble();
   6798 
   6799    switch (opc) {
   6800 
   6801       case 0x6E:
   6802          if (sz == 4) {
   6803             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   6804             modrm = getUChar(delta);
   6805             if (epartIsReg(modrm)) {
   6806                delta++;
   6807                putMMXReg(
   6808                   gregLO3ofRM(modrm),
   6809                   binop( Iop_32HLto64,
   6810                          mkU32(0),
   6811                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   6812                DIP("movd %s, %s\n",
   6813                    nameIReg32(eregOfRexRM(pfx,modrm)),
   6814                    nameMMXReg(gregLO3ofRM(modrm)));
   6815             } else {
   6816                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6817                delta += len;
   6818                putMMXReg(
   6819                   gregLO3ofRM(modrm),
   6820                   binop( Iop_32HLto64,
   6821                          mkU32(0),
   6822                          loadLE(Ity_I32, mkexpr(addr)) ) );
   6823                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6824             }
   6825          }
   6826          else
   6827          if (sz == 8) {
   6828             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   6829             modrm = getUChar(delta);
   6830             if (epartIsReg(modrm)) {
   6831                delta++;
   6832                putMMXReg( gregLO3ofRM(modrm),
   6833                           getIReg64(eregOfRexRM(pfx,modrm)) );
   6834                DIP("movd %s, %s\n",
   6835                    nameIReg64(eregOfRexRM(pfx,modrm)),
   6836                    nameMMXReg(gregLO3ofRM(modrm)));
   6837             } else {
   6838                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6839                delta += len;
   6840                putMMXReg( gregLO3ofRM(modrm),
   6841                           loadLE(Ity_I64, mkexpr(addr)) );
   6842                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6843             }
   6844          }
   6845          else {
   6846             goto mmx_decode_failure;
   6847          }
   6848          break;
   6849 
   6850       case 0x7E:
   6851          if (sz == 4) {
   6852             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   6853             modrm = getUChar(delta);
   6854             if (epartIsReg(modrm)) {
   6855                delta++;
   6856                putIReg32( eregOfRexRM(pfx,modrm),
   6857                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   6858                DIP("movd %s, %s\n",
   6859                    nameMMXReg(gregLO3ofRM(modrm)),
   6860                    nameIReg32(eregOfRexRM(pfx,modrm)));
   6861             } else {
   6862                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6863                delta += len;
   6864                storeLE( mkexpr(addr),
   6865                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   6866                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6867             }
   6868          }
   6869          else
   6870          if (sz == 8) {
   6871             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   6872             modrm = getUChar(delta);
   6873             if (epartIsReg(modrm)) {
   6874                delta++;
   6875                putIReg64( eregOfRexRM(pfx,modrm),
   6876                           getMMXReg(gregLO3ofRM(modrm)) );
   6877                DIP("movd %s, %s\n",
   6878                    nameMMXReg(gregLO3ofRM(modrm)),
   6879                    nameIReg64(eregOfRexRM(pfx,modrm)));
   6880             } else {
   6881                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6882                delta += len;
   6883                storeLE( mkexpr(addr),
   6884                        getMMXReg(gregLO3ofRM(modrm)) );
   6885                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6886             }
   6887          } else {
   6888             goto mmx_decode_failure;
   6889          }
   6890          break;
   6891 
   6892       case 0x6F:
   6893          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   6894          if (sz != 4
   6895              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6896             goto mmx_decode_failure;
   6897          modrm = getUChar(delta);
   6898          if (epartIsReg(modrm)) {
   6899             delta++;
   6900             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   6901             DIP("movq %s, %s\n",
   6902                 nameMMXReg(eregLO3ofRM(modrm)),
   6903                 nameMMXReg(gregLO3ofRM(modrm)));
   6904          } else {
   6905             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6906             delta += len;
   6907             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   6908             DIP("movq %s, %s\n",
   6909                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   6910          }
   6911          break;
   6912 
   6913       case 0x7F:
   6914          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   6915          if (sz != 4
   6916              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6917             goto mmx_decode_failure;
   6918          modrm = getUChar(delta);
   6919          if (epartIsReg(modrm)) {
   6920             /* Fall through.  The assembler doesn't appear to generate
   6921                these. */
   6922             goto mmx_decode_failure;
   6923          } else {
   6924             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6925             delta += len;
   6926             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   6927             DIP("mov(nt)q %s, %s\n",
   6928                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   6929          }
   6930          break;
   6931 
   6932       case 0xFC:
   6933       case 0xFD:
   6934       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   6935          if (sz != 4)
   6936             goto mmx_decode_failure;
   6937          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   6938          break;
   6939 
   6940       case 0xEC:
   6941       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6942          if (sz != 4
   6943              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   6944             goto mmx_decode_failure;
   6945          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   6946          break;
   6947 
   6948       case 0xDC:
   6949       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6950          if (sz != 4)
   6951             goto mmx_decode_failure;
   6952          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   6953          break;
   6954 
   6955       case 0xF8:
   6956       case 0xF9:
   6957       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   6958          if (sz != 4)
   6959             goto mmx_decode_failure;
   6960          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   6961          break;
   6962 
   6963       case 0xE8:
   6964       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6965          if (sz != 4)
   6966             goto mmx_decode_failure;
   6967          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   6968          break;
   6969 
   6970       case 0xD8:
   6971       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   6972          if (sz != 4)
   6973             goto mmx_decode_failure;
   6974          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   6975          break;
   6976 
   6977       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   6978          if (sz != 4)
   6979             goto mmx_decode_failure;
   6980          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   6981          break;
   6982 
   6983       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   6984          if (sz != 4)
   6985             goto mmx_decode_failure;
   6986          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   6987          break;
   6988 
   6989       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   6990          vassert(sz == 4);
   6991          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   6992          break;
   6993 
   6994       case 0x74:
   6995       case 0x75:
   6996       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   6997          if (sz != 4)
   6998             goto mmx_decode_failure;
   6999          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   7000          break;
   7001 
   7002       case 0x64:
   7003       case 0x65:
   7004       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   7005          if (sz != 4)
   7006             goto mmx_decode_failure;
   7007          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   7008          break;
   7009 
   7010       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   7011          if (sz != 4)
   7012             goto mmx_decode_failure;
   7013          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   7014          break;
   7015 
   7016       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7017          if (sz != 4)
   7018             goto mmx_decode_failure;
   7019          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   7020          break;
   7021 
   7022       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7023          if (sz != 4)
   7024             goto mmx_decode_failure;
   7025          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   7026          break;
   7027 
   7028       case 0x68:
   7029       case 0x69:
   7030       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   7031          if (sz != 4
   7032              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7033             goto mmx_decode_failure;
   7034          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   7035          break;
   7036 
   7037       case 0x60:
   7038       case 0x61:
   7039       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7040          if (sz != 4
   7041              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7042             goto mmx_decode_failure;
   7043          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   7044          break;
   7045 
   7046       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   7047          if (sz != 4)
   7048             goto mmx_decode_failure;
   7049          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   7050          break;
   7051 
   7052       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   7053          if (sz != 4)
   7054             goto mmx_decode_failure;
   7055          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7056          break;
   7057 
   7058       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7059          if (sz != 4)
   7060             goto mmx_decode_failure;
   7061          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7062          break;
   7063 
   7064       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7065          if (sz != 4)
   7066             goto mmx_decode_failure;
   7067          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7068          break;
   7069 
   7070 #     define SHIFT_BY_REG(_name,_op)                                     \
   7071                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7072                 break;
   7073 
   7074       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7075       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7076       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7077       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7078 
   7079       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7080       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7081       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7082       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7083 
   7084       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7085       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7086       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7087 
   7088 #     undef SHIFT_BY_REG
   7089 
   7090       case 0x71:
   7091       case 0x72:
   7092       case 0x73: {
   7093          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7094          UChar byte2, subopc;
   7095          if (sz != 4)
   7096             goto mmx_decode_failure;
   7097          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7098          subopc = toUChar( (byte2 >> 3) & 7 );
   7099 
   7100 #        define SHIFT_BY_IMM(_name,_op)                        \
   7101             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7102             } while (0)
   7103 
   7104               if (subopc == 2 /*SRL*/ && opc == 0x71)
   7105                   SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7106          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7107                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7108          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7109                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7110 
   7111          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7112                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7113          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7114                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7115 
   7116          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7117                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7118          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   7119                   SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7120          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7121                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7122 
   7123          else goto mmx_decode_failure;
   7124 
   7125 #        undef SHIFT_BY_IMM
   7126          break;
   7127       }
   7128 
   7129       case 0xF7: {
   7130          IRTemp addr    = newTemp(Ity_I64);
   7131          IRTemp regD    = newTemp(Ity_I64);
   7132          IRTemp regM    = newTemp(Ity_I64);
   7133          IRTemp mask    = newTemp(Ity_I64);
   7134          IRTemp olddata = newTemp(Ity_I64);
   7135          IRTemp newdata = newTemp(Ity_I64);
   7136 
   7137          modrm = getUChar(delta);
   7138          if (sz != 4 || (!epartIsReg(modrm)))
   7139             goto mmx_decode_failure;
   7140          delta++;
   7141 
   7142          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7143          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7144          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7145          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   7146          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7147          assign( newdata,
   7148                  binop(Iop_Or64,
   7149                        binop(Iop_And64,
   7150                              mkexpr(regD),
   7151                              mkexpr(mask) ),
   7152                        binop(Iop_And64,
   7153                              mkexpr(olddata),
   7154                              unop(Iop_Not64, mkexpr(mask)))) );
   7155          storeLE( mkexpr(addr), mkexpr(newdata) );
   7156          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7157                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7158          break;
   7159       }
   7160 
   7161       /* --- MMX decode failure --- */
   7162       default:
   7163       mmx_decode_failure:
   7164          *decode_ok = False;
   7165          return delta; /* ignored */
   7166 
   7167    }
   7168 
   7169    *decode_ok = True;
   7170    return delta;
   7171 }
   7172 
   7173 
   7174 /*------------------------------------------------------------*/
   7175 /*--- More misc arithmetic and other obscure insns.        ---*/
   7176 /*------------------------------------------------------------*/
   7177 
   7178 /* Generate base << amt with vacated places filled with stuff
   7179    from xtra.  amt guaranteed in 0 .. 63. */
   7180 static
   7181 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7182 {
   7183    /* if   amt == 0
   7184       then base
   7185       else (base << amt) | (xtra >>u (64-amt))
   7186    */
   7187    return
   7188       IRExpr_Mux0X(
   7189          mkexpr(amt),
   7190          mkexpr(base),
   7191          binop(Iop_Or64,
   7192                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7193                binop(Iop_Shr64, mkexpr(xtra),
   7194                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7195          )
   7196       );
   7197 }
   7198 
   7199 /* Generate base >>u amt with vacated places filled with stuff
   7200    from xtra.  amt guaranteed in 0 .. 63. */
   7201 static
   7202 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7203 {
   7204    /* if   amt == 0
   7205       then base
   7206       else (base >>u amt) | (xtra << (64-amt))
   7207    */
   7208    return
   7209       IRExpr_Mux0X(
   7210          mkexpr(amt),
   7211          mkexpr(base),
   7212          binop(Iop_Or64,
   7213                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7214                binop(Iop_Shl64, mkexpr(xtra),
   7215                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7216          )
   7217       );
   7218 }
   7219 
   7220 /* Double length left and right shifts.  Apparently only required in
   7221    v-size (no b- variant). */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,      /* :: Ity_I8 */
                        Bool amt_is_literal,    /* True: imm8 follows amode */
                        HChar* shift_amt_txt,   /* for disassembly printing */
                        Bool left_shift )       /* True: SHLD, False: SHRD */
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte.
      Returns the updated instruction offset (delta). */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);            /* the G (bit-supplier) operand */
   IRTemp esrc   = newTemp(ty);            /* the E (destination) operand */
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);        /* masked shift count */
   IRTemp tmpSS  = newTemp(Ity_I8);        /* masked (count - 1), for flags */
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* Shift counts are masked mod 64 for 8-byte operands, mod 32
      otherwise. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64).  The subshifted value (shift by count-1) is what
      the flags thunk below needs. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         /* 64-bit word is [esrc'gsrc]; shift left, result lands in
            the high half. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         /* 64-bit word is [gsrc'esrc]; shift right, result lands in
            the low half. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         /* Replicate gsrc so counts up to 31 still pull in defined
            bits: word is [esrc'gsrc'gsrc'gsrc]. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         /* Replicate gsrc at the top: word is [gsrc'gsrc'gsrc'esrc]. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* 64-bit case: no widening needed; use the double-length shift
         helpers directly. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* Skip the imm8 shift count, which disAMode was told about above. */
   if (amt_is_literal) delta++;
   return delta;
}
   7388 
   7389 
   7390 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7391    required. */
   7392 
/* Which read-modify-write variant of bit-test: none (BT), set (BTS),
   reset (BTR) or complement (BTC). */
typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7394 
   7395 static HChar* nameBtOp ( BtOp op )
   7396 {
   7397    switch (op) {
   7398       case BtOpNone:  return "";
   7399       case BtOpSet:   return "s";
   7400       case BtOpReset: return "r";
   7401       case BtOpComp:  return "c";
   7402       default: vpanic("nameBtOp(amd64)");
   7403    }
   7404 }
   7405 
   7406 
static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op )
{
   /* Handle BT/BTS/BTR/BTC G(reg), E(reg-or-mem).  'op' selects the
      variant (test-only, set, reset or complement); sz is the operand
      size in bytes (2, 4 or 8).  delta points at the modrm byte on
      entry; returns the updated delta. */
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
     t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);
   t_new     = newTemp(Ity_I8);
   t_bitno0  = newTemp(Ity_I64);
   t_bitno1  = newTemp(Ity_I64);
   t_bitno2  = newTemp(Ity_I8);
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   /* Bit number comes from the G register, widened (widenSto64) to 64
      bits; the Iop_Sar64 below then treats it as signed, so the
      memory form can address bits below the base address. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      /* Memory form: the bit number is not masked. */
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Byte address = base + (bitno >> 3).  Sar (not Shr) so that
      negative bit numbers address bytes below the base. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* With a LOCK prefix on a memory operand, do the update with a
         compare-and-swap so the read-modify-write is atomic. */
      if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   7561 
   7562 
   7563 
   7564 /* Handle BSF/BSR.  Only v-size seems necessary. */
static
ULong dis_bs_E_G ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   /* BSF (fwds==True) or BSR (fwds==False): find the lowest/highest
      set bit in E (reg-or-mem) and write its index to G.  sz is 2, 4
      or 8.  delta points at the modrm byte; returns the updated
      delta. */
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);
   IRTemp dst   = newTemp(ty);
   IRTemp src64 = newTemp(Ity_I64);
   IRTemp dst64 = newTemp(Ity_I64);
   IRTemp src8  = newTemp(Ity_I8);

   vassert(sz == 8 || sz == 4 || sz == 2);

   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate an 8-bit expression which is zero iff the
      original is zero, and nonzero otherwise */
   assign( src8,
           unop(Iop_1Uto8,
                binop(Iop_CmpNE64,
                      mkexpr(src64), mkU64(0))) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_Mux0X( mkexpr(src8),
                          /* src==0 */
                          mkU64(AMD64G_CC_MASK_Z),
                          /* src!=0 */
                          mkU64(0)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero.  Note that since
      src64 is zero-widened, 63 - Clz64 gives the correct index even
      for the 2- and 4-byte sizes. */
   assign( dst64,
           IRExpr_Mux0X(
              mkexpr(src8),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) ),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64)))
           )
         );

   /* Narrow the 64-bit result back to the operand size. */
   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   7679 
   7680 
   7681 /* swap rAX with the reg specified by reg and REX.B */
   7682 static
   7683 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   7684 {
   7685    IRType ty = szToITy(sz);
   7686    IRTemp t1 = newTemp(ty);
   7687    IRTemp t2 = newTemp(ty);
   7688    vassert(sz == 2 || sz == 4 || sz == 8);
   7689    vassert(regLo3 < 8);
   7690    if (sz == 8) {
   7691       assign( t1, getIReg64(R_RAX) );
   7692       assign( t2, getIRegRexB(8, pfx, regLo3) );
   7693       putIReg64( R_RAX, mkexpr(t2) );
   7694       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   7695    } else if (sz == 4) {
   7696       assign( t1, getIReg32(R_RAX) );
   7697       assign( t2, getIRegRexB(4, pfx, regLo3) );
   7698       putIReg32( R_RAX, mkexpr(t2) );
   7699       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   7700    } else {
   7701       assign( t1, getIReg16(R_RAX) );
   7702       assign( t2, getIRegRexB(2, pfx, regLo3) );
   7703       putIReg16( R_RAX, mkexpr(t2) );
   7704       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   7705    }
   7706    DIP("xchg%c %s, %s\n",
   7707        nameISize(sz), nameIRegRAX(sz),
   7708                       nameIRegRexB(sz,pfx, regLo3));
   7709 }
   7710 
   7711 
   7712 static
   7713 void codegen_SAHF ( void )
   7714 {
   7715    /* Set the flags to:
   7716       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   7717                                     -- retain the old O flag
   7718       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7719                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   7720    */
   7721    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7722                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7723    IRTemp oldflags   = newTemp(Ity_I64);
   7724    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   7725    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   7726    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   7727    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   7728    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7729          binop(Iop_Or64,
   7730                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   7731                binop(Iop_And64,
   7732                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   7733                      mkU64(mask_SZACP))
   7734               )
   7735    ));
   7736 }
   7737 
   7738 
   7739 static
   7740 void codegen_LAHF ( void  )
   7741 {
   7742    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   7743    IRExpr* rax_with_hole;
   7744    IRExpr* new_byte;
   7745    IRExpr* new_rax;
   7746    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7747                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7748 
   7749    IRTemp  flags = newTemp(Ity_I64);
   7750    assign( flags, mk_amd64g_calculate_rflags_all() );
   7751 
   7752    rax_with_hole
   7753       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   7754    new_byte
   7755       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   7756                         mkU64(1<<1));
   7757    new_rax
   7758       = binop(Iop_Or64, rax_with_hole,
   7759                         binop(Iop_Shl64, new_byte, mkU8(8)));
   7760    putIReg64(R_RAX, new_rax);
   7761 }
   7762 
   7763 
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   /* Disassemble CMPXCHG G(reg),E(reg-or-mem) starting at guest
      offset delta0.  Semantics: compare rAX against E with flags set
      as for (rAX - E); if equal, E := G, otherwise rAX := E.
      Returns the updated offset.  *ok is set True on every path
      reached here (all three addressing/lock forms are handled). */
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);      /* rAX, the accumulator/expected value */
   IRTemp src   = newTemp(ty);      /* G, the candidate new value */
   IRTemp dest  = newTemp(ty);      /* old value of E */
   IRTemp dest2 = newTemp(ty);      /* value written back to E (cases 1,2) */
   IRTemp acc2  = newTemp(ty);      /* new value for rAX */
   IRTemp cond8 = newTemp(Ity_I8);  /* 1 iff acc == dest (Z of the sub) */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on Mux0X

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on Mux0X

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      /* Mux0X: cond8==0 selects the first arm (keep old), else the second. */
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      /* Non-atomic read-modify-write of memory (no LOCK prefix). */
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* The CAS itself does the atomic compare-and-store; dest
         receives the old memory value regardless of success. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   7853 
   7854 
   7855 /* Handle conditional move instructions of the form
   7856       cmovcc E(reg-or-mem), G(reg)
   7857 
   7858    E(src) is reg-or-mem
   7859    G(dst) is reg.
   7860 
   7861    If E is reg, -->    GET %E, tmps
   7862                        GET %G, tmpd
   7863                        CMOVcc tmps, tmpd
   7864                        PUT tmpd, %G
   7865 
   7866    If E is mem  -->    (getAddr E) -> tmpa
   7867                        LD (tmpa), tmps
   7868                        GET %G, tmpd
   7869                        CMOVcc tmps, tmpd
   7870                        PUT tmpd, %G
   7871 */
   7872 static
   7873 ULong dis_cmov_E_G ( VexAbiInfo* vbi,
   7874                      Prefix        pfx,
   7875                      Int           sz,
   7876                      AMD64Condcode cond,
   7877                      Long          delta0 )
   7878 {
   7879    UChar rm  = getUChar(delta0);
   7880    HChar dis_buf[50];
   7881    Int   len;
   7882 
   7883    IRType ty   = szToITy(sz);
   7884    IRTemp tmps = newTemp(ty);
   7885    IRTemp tmpd = newTemp(ty);
   7886 
   7887    if (epartIsReg(rm)) {
   7888       assign( tmps, getIRegE(sz, pfx, rm) );
   7889       assign( tmpd, getIRegG(sz, pfx, rm) );
   7890 
   7891       putIRegG( sz, pfx, rm,
   7892                 IRExpr_Mux0X( unop(Iop_1Uto8,
   7893                                    mk_amd64g_calculate_condition(cond)),
   7894                               mkexpr(tmpd),
   7895                               mkexpr(tmps) )
   7896               );
   7897       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   7898                             nameIRegE(sz,pfx,rm),
   7899                             nameIRegG(sz,pfx,rm));
   7900       return 1+delta0;
   7901    }
   7902 
   7903    /* E refers to memory */
   7904    {
   7905       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   7906       assign( tmps, loadLE(ty, mkexpr(addr)) );
   7907       assign( tmpd, getIRegG(sz, pfx, rm) );
   7908 
   7909       putIRegG( sz, pfx, rm,
   7910                 IRExpr_Mux0X( unop(Iop_1Uto8,
   7911                                    mk_amd64g_calculate_condition(cond)),
   7912                               mkexpr(tmpd),
   7913                               mkexpr(tmps) )
   7914               );
   7915 
   7916       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   7917                             dis_buf,
   7918                             nameIRegG(sz,pfx,rm));
   7919       return len+delta0;
   7920    }
   7921 }
   7922 
   7923 
   7924 static
   7925 ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
   7926                      VexAbiInfo* vbi,
   7927                      Prefix pfx, Int sz, Long delta0 )
   7928 {
   7929    Int   len;
   7930    UChar rm = getUChar(delta0);
   7931    HChar dis_buf[50];
   7932 
   7933    IRType ty    = szToITy(sz);
   7934    IRTemp tmpd  = newTemp(ty);
   7935    IRTemp tmpt0 = newTemp(ty);
   7936    IRTemp tmpt1 = newTemp(ty);
   7937 
   7938    /* There are 3 cases to consider:
   7939 
   7940       reg-reg: ignore any lock prefix,
   7941                generate 'naive' (non-atomic) sequence
   7942 
   7943       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   7944                            (non-atomic) sequence
   7945 
   7946       reg-mem, locked: use IRCAS
   7947    */
   7948 
   7949    if (epartIsReg(rm)) {
   7950       /* case 1 */
   7951       assign( tmpd, getIRegE(sz, pfx, rm) );
   7952       assign( tmpt0, getIRegG(sz, pfx, rm) );
   7953       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   7954                            mkexpr(tmpd), mkexpr(tmpt0)) );
   7955       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   7956       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   7957       putIRegE(sz, pfx, rm, mkexpr(tmpt1));
   7958       DIP("xadd%c %s, %s\n",
   7959           nameISize(sz), nameIRegG(sz,pfx,rm),
   7960           				 nameIRegE(sz,pfx,rm));
   7961       *decode_ok = True;
   7962       return 1+delta0;
   7963    }
   7964    else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
   7965       /* case 2 */
   7966       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   7967       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   7968       assign( tmpt0, getIRegG(sz, pfx, rm) );
   7969       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   7970                            mkexpr(tmpd), mkexpr(tmpt0)) );
   7971       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   7972       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   7973       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   7974       DIP("xadd%c %s, %s\n",
   7975           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   7976       *decode_ok = True;
   7977       return len+delta0;
   7978    }
   7979    else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
   7980       /* case 3 */
   7981       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   7982       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   7983       assign( tmpt0, getIRegG(sz, pfx, rm) );
   7984       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   7985                            mkexpr(tmpd), mkexpr(tmpt0)) );
   7986       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   7987                            mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
   7988       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   7989       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   7990       DIP("xadd%c %s, %s\n",
   7991           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   7992       *decode_ok = True;
   7993       return len+delta0;
   7994    }
   7995    /*UNREACHED*/
   7996    vassert(0);
   7997 }
   7998 
   7999 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8000 //..
   8001 //.. static
   8002 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8003 //.. {
   8004 //..    Int    len;
   8005 //..    IRTemp addr;
   8006 //..    UChar  rm  = getUChar(delta0);
   8007 //..    HChar  dis_buf[50];
   8008 //..
   8009 //..    if (epartIsReg(rm)) {
   8010 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8011 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8012 //..       return 1+delta0;
   8013 //..    } else {
   8014 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8015 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8016 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8017 //..       return len+delta0;
   8018 //..    }
   8019 //.. }
   8020 //..
   8021 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8022 //..    dst is ireg and sz==4, zero out top half of it.  */
   8023 //..
   8024 //.. static
   8025 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8026 //..                      Int   sz,
   8027 //..                      UInt  delta0 )
   8028 //.. {
   8029 //..    Int    len;
   8030 //..    IRTemp addr;
   8031 //..    UChar  rm  = getUChar(delta0);
   8032 //..    HChar  dis_buf[50];
   8033 //..
   8034 //..    vassert(sz == 2 || sz == 4);
   8035 //..
   8036 //..    if (epartIsReg(rm)) {
   8037 //..       if (sz == 4)
   8038 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8039 //..       else
   8040 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8041 //..
   8042 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8043 //..       return 1+delta0;
   8044 //..    } else {
   8045 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8046 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8047 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8048 //..       return len+delta0;
   8049 //..    }
   8050 //.. }
   8051 //..
   8052 //..
   8053 //.. static
   8054 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8055 //.. {
   8056 //..     IRTemp t1 = newTemp(Ity_I16);
   8057 //..     IRTemp ta = newTemp(Ity_I32);
   8058 //..     vassert(sz == 2 || sz == 4);
   8059 //..
   8060 //..     assign( t1, getSReg(sreg) );
   8061 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8062 //..     putIReg(4, R_ESP, mkexpr(ta));
   8063 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8064 //..
   8065 //..     DIP("pushw %s\n", nameSReg(sreg));
   8066 //.. }
   8067 //..
   8068 //.. static
   8069 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8070 //.. {
   8071 //..     IRTemp t1 = newTemp(Ity_I16);
   8072 //..     IRTemp ta = newTemp(Ity_I32);
   8073 //..     vassert(sz == 2 || sz == 4);
   8074 //..
   8075 //..     assign( ta, getIReg(4, R_ESP) );
   8076 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8077 //..
   8078 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8079 //..     putSReg( sreg, mkexpr(t1) );
   8080 //..     DIP("pop %s\n", nameSReg(sreg));
   8081 //.. }
   8082 
   8083 static
   8084 void dis_ret ( VexAbiInfo* vbi, ULong d64 )
   8085 {
   8086    IRTemp t1 = newTemp(Ity_I64);
   8087    IRTemp t2 = newTemp(Ity_I64);
   8088    IRTemp t3 = newTemp(Ity_I64);
   8089    assign(t1, getIReg64(R_RSP));
   8090    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8091    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8092    putIReg64(R_RSP, mkexpr(t3));
   8093    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8094    jmp_treg(Ijk_Ret,t2);
   8095 }
   8096 
   8097 
   8098 /*------------------------------------------------------------*/
   8099 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8100 /*------------------------------------------------------------*/
   8101 
   8102 /* Worker function; do not call directly.
   8103    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8104 */
   8105 
   8106 static ULong dis_SSE_E_to_G_all_wrk (
   8107                 VexAbiInfo* vbi,
   8108                 Prefix pfx, Long delta,
   8109                 HChar* opname, IROp op,
   8110                 Bool   invertG
   8111              )
   8112 {
   8113    HChar   dis_buf[50];
   8114    Int     alen;
   8115    IRTemp  addr;
   8116    UChar   rm = getUChar(delta);
   8117    IRExpr* gpart
   8118       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8119                 : getXMMReg(gregOfRexRM(pfx,rm));
   8120    if (epartIsReg(rm)) {
   8121       putXMMReg( gregOfRexRM(pfx,rm),
   8122                  binop(op, gpart,
   8123                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8124       DIP("%s %s,%s\n", opname,
   8125                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8126                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8127       return delta+1;
   8128    } else {
   8129       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8130       putXMMReg( gregOfRexRM(pfx,rm),
   8131                  binop(op, gpart,
   8132                            loadLE(Ity_V128, mkexpr(addr))) );
   8133       DIP("%s %s,%s\n", opname,
   8134                         dis_buf,
   8135                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8136       return delta+alen;
   8137    }
   8138 }
   8139 
   8140 
   8141 /* All lanes SSE binary operation, G = G `op` E. */
   8142 
static
ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           HChar* opname, IROp op )
{
   /* All-lanes SSE binary op, G = G `op` E; delegates to the worker
      with no inversion of G.  Returns the updated delta. */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
}
   8150 
   8151 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8152 
static
ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
                                Prefix pfx, Long delta,
                                HChar* opname, IROp op )
{
   /* All-lanes SSE binary op with inverted G, G = (not G) `op` E;
      delegates to the worker.  Returns the updated delta. */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
}
   8160 
   8161 
   8162 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8163 
   8164 static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
   8165                                    Prefix pfx, Long delta,
   8166                                    HChar* opname, IROp op )
   8167 {
   8168    HChar   dis_buf[50];
   8169    Int     alen;
   8170    IRTemp  addr;
   8171    UChar   rm = getUChar(delta);
   8172    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8173    if (epartIsReg(rm)) {
   8174       putXMMReg( gregOfRexRM(pfx,rm),
   8175                  binop(op, gpart,
   8176                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8177       DIP("%s %s,%s\n", opname,
   8178                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8179                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8180       return delta+1;
   8181    } else {
   8182       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8183          E operand needs to be made simply of zeroes. */
   8184       IRTemp epart = newTemp(Ity_V128);
   8185       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8186       assign( epart, unop( Iop_32UtoV128,
   8187                            loadLE(Ity_I32, mkexpr(addr))) );
   8188       putXMMReg( gregOfRexRM(pfx,rm),
   8189                  binop(op, gpart, mkexpr(epart)) );
   8190       DIP("%s %s,%s\n", opname,
   8191                         dis_buf,
   8192                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8193       return delta+alen;
   8194    }
   8195 }
   8196 
   8197 
   8198 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8199 
   8200 static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
   8201                                    Prefix pfx, Long delta,
   8202                                    HChar* opname, IROp op )
   8203 {
   8204    HChar   dis_buf[50];
   8205    Int     alen;
   8206    IRTemp  addr;
   8207    UChar   rm = getUChar(delta);
   8208    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8209    if (epartIsReg(rm)) {
   8210       putXMMReg( gregOfRexRM(pfx,rm),
   8211                  binop(op, gpart,
   8212                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8213       DIP("%s %s,%s\n", opname,
   8214                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8215                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8216       return delta+1;
   8217    } else {
   8218       /* We can only do a 64-bit memory read, so the upper half of the
   8219          E operand needs to be made simply of zeroes. */
   8220       IRTemp epart = newTemp(Ity_V128);
   8221       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8222       assign( epart, unop( Iop_64UtoV128,
   8223                            loadLE(Ity_I64, mkexpr(addr))) );
   8224       putXMMReg( gregOfRexRM(pfx,rm),
   8225                  binop(op, gpart, mkexpr(epart)) );
   8226       DIP("%s %s,%s\n", opname,
   8227                         dis_buf,
   8228                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8229       return delta+alen;
   8230    }
   8231 }
   8232 
   8233 
   8234 /* All lanes unary SSE operation, G = op(E). */
   8235 
   8236 static ULong dis_SSE_E_to_G_unary_all (
   8237                 VexAbiInfo* vbi,
   8238                 Prefix pfx, Long delta,
   8239                 HChar* opname, IROp op
   8240              )
   8241 {
   8242    HChar   dis_buf[50];
   8243    Int     alen;
   8244    IRTemp  addr;
   8245    UChar   rm = getUChar(delta);
   8246    if (epartIsReg(rm)) {
   8247       putXMMReg( gregOfRexRM(pfx,rm),
   8248                  unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
   8249       DIP("%s %s,%s\n", opname,
   8250                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8251                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8252       return delta+1;
   8253    } else {
   8254       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8255       putXMMReg( gregOfRexRM(pfx,rm),
   8256                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   8257       DIP("%s %s,%s\n", opname,
   8258                         dis_buf,
   8259                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8260       return delta+alen;
   8261    }
   8262 }
   8263 
   8264 
   8265 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8266 
   8267 static ULong dis_SSE_E_to_G_unary_lo32 (
   8268                 VexAbiInfo* vbi,
   8269                 Prefix pfx, Long delta,
   8270                 HChar* opname, IROp op
   8271              )
   8272 {
   8273    /* First we need to get the old G value and patch the low 32 bits
   8274       of the E operand into it.  Then apply op and write back to G. */
   8275    HChar   dis_buf[50];
   8276    Int     alen;
   8277    IRTemp  addr;
   8278    UChar   rm = getUChar(delta);
   8279    IRTemp  oldG0 = newTemp(Ity_V128);
   8280    IRTemp  oldG1 = newTemp(Ity_V128);
   8281 
   8282    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8283 
   8284    if (epartIsReg(rm)) {
   8285       assign( oldG1,
   8286               binop( Iop_SetV128lo32,
   8287                      mkexpr(oldG0),
   8288                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8289       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8290       DIP("%s %s,%s\n", opname,
   8291                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8292                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8293       return delta+1;
   8294    } else {
   8295       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8296       assign( oldG1,
   8297               binop( Iop_SetV128lo32,
   8298                      mkexpr(oldG0),
   8299                      loadLE(Ity_I32, mkexpr(addr)) ));
   8300       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8301       DIP("%s %s,%s\n", opname,
   8302                         dis_buf,
   8303                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8304       return delta+alen;
   8305    }
   8306 }
   8307 
   8308 
   8309 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   8310 
   8311 static ULong dis_SSE_E_to_G_unary_lo64 (
   8312                 VexAbiInfo* vbi,
   8313                 Prefix pfx, Long delta,
   8314                 HChar* opname, IROp op
   8315              )
   8316 {
   8317    /* First we need to get the old G value and patch the low 64 bits
   8318       of the E operand into it.  Then apply op and write back to G. */
   8319    HChar   dis_buf[50];
   8320    Int     alen;
   8321    IRTemp  addr;
   8322    UChar   rm = getUChar(delta);
   8323    IRTemp  oldG0 = newTemp(Ity_V128);
   8324    IRTemp  oldG1 = newTemp(Ity_V128);
   8325 
   8326    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8327 
   8328    if (epartIsReg(rm)) {
   8329       assign( oldG1,
   8330               binop( Iop_SetV128lo64,
   8331                      mkexpr(oldG0),
   8332                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   8333       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8334       DIP("%s %s,%s\n", opname,
   8335                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8336                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8337       return delta+1;
   8338    } else {
   8339       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8340       assign( oldG1,
   8341               binop( Iop_SetV128lo64,
   8342                      mkexpr(oldG0),
   8343                      loadLE(Ity_I64, mkexpr(addr)) ));
   8344       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8345       DIP("%s %s,%s\n", opname,
   8346                         dis_buf,
   8347                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8348       return delta+alen;
   8349    }
   8350 }
   8351 
   8352 
   8353 /* SSE integer binary operation:
   8354       G = G `op` E   (eLeft == False)
   8355       G = E `op` G   (eLeft == True)
   8356 */
   8357 static ULong dis_SSEint_E_to_G(
   8358                 VexAbiInfo* vbi,
   8359                 Prefix pfx, Long delta,
   8360                 HChar* opname, IROp op,
   8361                 Bool   eLeft
   8362              )
   8363 {
   8364    HChar   dis_buf[50];
   8365    Int     alen;
   8366    IRTemp  addr;
   8367    UChar   rm = getUChar(delta);
   8368    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8369    IRExpr* epart = NULL;
   8370    if (epartIsReg(rm)) {
   8371       epart = getXMMReg(eregOfRexRM(pfx,rm));
   8372       DIP("%s %s,%s\n", opname,
   8373                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8374                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8375       delta += 1;
   8376    } else {
   8377       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8378       epart = loadLE(Ity_V128, mkexpr(addr));
   8379       DIP("%s %s,%s\n", opname,
   8380                         dis_buf,
   8381                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8382       delta += alen;
   8383    }
   8384    putXMMReg( gregOfRexRM(pfx,rm),
   8385               eLeft ? binop(op, epart, gpart)
   8386 	            : binop(op, gpart, epart) );
   8387    return delta;
   8388 }
   8389 
   8390 
   8391 /* Helper for doing SSE FP comparisons. */
   8392 
   8393 static void findSSECmpOp ( Bool* needNot, IROp* op,
   8394                            Int imm8, Bool all_lanes, Int sz )
   8395 {
   8396    imm8 &= 7;
   8397    *needNot = False;
   8398    *op      = Iop_INVALID;
   8399    if (imm8 >= 4) {
   8400       *needNot = True;
   8401       imm8 -= 4;
   8402    }
   8403 
   8404    if (sz == 4 && all_lanes) {
   8405       switch (imm8) {
   8406          case 0: *op = Iop_CmpEQ32Fx4; return;
   8407          case 1: *op = Iop_CmpLT32Fx4; return;
   8408          case 2: *op = Iop_CmpLE32Fx4; return;
   8409          case 3: *op = Iop_CmpUN32Fx4; return;
   8410          default: break;
   8411       }
   8412    }
   8413    if (sz == 4 && !all_lanes) {
   8414       switch (imm8) {
   8415          case 0: *op = Iop_CmpEQ32F0x4; return;
   8416          case 1: *op = Iop_CmpLT32F0x4; return;
   8417          case 2: *op = Iop_CmpLE32F0x4; return;
   8418          case 3: *op = Iop_CmpUN32F0x4; return;
   8419          default: break;
   8420       }
   8421    }
   8422    if (sz == 8 && all_lanes) {
   8423       switch (imm8) {
   8424          case 0: *op = Iop_CmpEQ64Fx2; return;
   8425          case 1: *op = Iop_CmpLT64Fx2; return;
   8426          case 2: *op = Iop_CmpLE64Fx2; return;
   8427          case 3: *op = Iop_CmpUN64Fx2; return;
   8428          default: break;
   8429       }
   8430    }
   8431    if (sz == 8 && !all_lanes) {
   8432       switch (imm8) {
   8433          case 0: *op = Iop_CmpEQ64F0x2; return;
   8434          case 1: *op = Iop_CmpLT64F0x2; return;
   8435          case 2: *op = Iop_CmpLE64F0x2; return;
   8436          case 3: *op = Iop_CmpUN64F0x2; return;
   8437          default: break;
   8438       }
   8439    }
   8440    vpanic("findSSECmpOp(amd64,guest)");
   8441 }
   8442 
   8443 /* Handles SSE 32F/64F comparisons. */
   8444 
   8445 static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi,
   8446                                  Prefix pfx, Long delta,
   8447                                  HChar* opname, Bool all_lanes, Int sz )
   8448 {
   8449    HChar   dis_buf[50];
   8450    Int     alen, imm8;
   8451    IRTemp  addr;
   8452    Bool    needNot = False;
   8453    IROp    op      = Iop_INVALID;
   8454    IRTemp  plain   = newTemp(Ity_V128);
   8455    UChar   rm      = getUChar(delta);
   8456    UShort  mask    = 0;
   8457    vassert(sz == 4 || sz == 8);
   8458    if (epartIsReg(rm)) {
   8459       imm8 = getUChar(delta+1);
   8460       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   8461       assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
   8462                                getXMMReg(eregOfRexRM(pfx,rm))) );
   8463       delta += 2;
   8464       DIP("%s $%d,%s,%s\n", opname,
   8465                             (Int)imm8,
   8466                             nameXMMReg(eregOfRexRM(pfx,rm)),
   8467                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   8468    } else {
   8469       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   8470       imm8 = getUChar(delta+alen);
   8471       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   8472       assign( plain,
   8473               binop(
   8474                  op,
   8475                  getXMMReg(gregOfRexRM(pfx,rm)),
   8476                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   8477                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   8478                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   8479 	      )
   8480       );
   8481       delta += alen+1;
   8482       DIP("%s $%d,%s,%s\n", opname,
   8483                             (Int)imm8,
   8484                             dis_buf,
   8485                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   8486    }
   8487 
   8488    if (needNot && all_lanes) {
   8489       putXMMReg( gregOfRexRM(pfx,rm),
   8490                  unop(Iop_NotV128, mkexpr(plain)) );
   8491    }
   8492    else
   8493    if (needNot && !all_lanes) {
   8494       mask = toUShort(sz==4 ? 0x000F : 0x00FF);
   8495       putXMMReg( gregOfRexRM(pfx,rm),
   8496                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   8497    }
   8498    else {
   8499       putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   8500    }
   8501 
   8502    return delta;
   8503 }
   8504 
   8505 
   8506 /* Vector by scalar shift of G by the amount specified at the bottom
   8507    of E. */
   8508 
   8509 static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
   8510                                   Prefix pfx, Long delta,
   8511                                   HChar* opname, IROp op )
   8512 {
   8513    HChar   dis_buf[50];
   8514    Int     alen, size;
   8515    IRTemp  addr;
   8516    Bool    shl, shr, sar;
   8517    UChar   rm   = getUChar(delta);
   8518    IRTemp  g0   = newTemp(Ity_V128);
   8519    IRTemp  g1   = newTemp(Ity_V128);
   8520    IRTemp  amt  = newTemp(Ity_I32);
   8521    IRTemp  amt8 = newTemp(Ity_I8);
   8522    if (epartIsReg(rm)) {
   8523       assign( amt, getXMMRegLane32(eregOfRexRM(pfx,rm), 0) );
   8524       DIP("%s %s,%s\n", opname,
   8525                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8526                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8527       delta++;
   8528    } else {
   8529       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8530       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   8531       DIP("%s %s,%s\n", opname,
   8532                         dis_buf,
   8533                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8534       delta += alen;
   8535    }
   8536    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   8537    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   8538 
   8539    shl = shr = sar = False;
   8540    size = 0;
   8541    switch (op) {
   8542       case Iop_ShlN16x8: shl = True; size = 32; break;
   8543       case Iop_ShlN32x4: shl = True; size = 32; break;
   8544       case Iop_ShlN64x2: shl = True; size = 64; break;
   8545       case Iop_SarN16x8: sar = True; size = 16; break;
   8546       case Iop_SarN32x4: sar = True; size = 32; break;
   8547       case Iop_ShrN16x8: shr = True; size = 16; break;
   8548       case Iop_ShrN32x4: shr = True; size = 32; break;
   8549       case Iop_ShrN64x2: shr = True; size = 64; break;
   8550       default: vassert(0);
   8551    }
   8552 
   8553    if (shl || shr) {
   8554      assign(
   8555         g1,
   8556         IRExpr_Mux0X(
   8557            unop(Iop_1Uto8,
   8558                 binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
   8559            mkV128(0x0000),
   8560            binop(op, mkexpr(g0), mkexpr(amt8))
   8561         )
   8562      );
   8563    } else
   8564    if (sar) {
   8565      assign(
   8566         g1,
   8567         IRExpr_Mux0X(
   8568            unop(Iop_1Uto8,
   8569                 binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
   8570            binop(op, mkexpr(g0), mkU8(size-1)),
   8571            binop(op, mkexpr(g0), mkexpr(amt8))
   8572         )
   8573      );
   8574    } else {
   8575       vassert(0);
   8576    }
   8577 
   8578    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   8579    return delta;
   8580 }
   8581 
   8582 
   8583 /* Vector by scalar shift of E by an immediate byte. */
   8584 
   8585 static
   8586 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   8587                            Long delta, HChar* opname, IROp op )
   8588 {
   8589    Bool    shl, shr, sar;
   8590    UChar   rm   = getUChar(delta);
   8591    IRTemp  e0   = newTemp(Ity_V128);
   8592    IRTemp  e1   = newTemp(Ity_V128);
   8593    UChar   amt, size;
   8594    vassert(epartIsReg(rm));
   8595    vassert(gregLO3ofRM(rm) == 2
   8596            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   8597    amt = getUChar(delta+1);
   8598    delta += 2;
   8599    DIP("%s $%d,%s\n", opname,
   8600                       (Int)amt,
   8601                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   8602    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   8603 
   8604    shl = shr = sar = False;
   8605    size = 0;
   8606    switch (op) {
   8607       case Iop_ShlN16x8: shl = True; size = 16; break;
   8608       case Iop_ShlN32x4: shl = True; size = 32; break;
   8609       case Iop_ShlN64x2: shl = True; size = 64; break;
   8610       case Iop_SarN16x8: sar = True; size = 16; break;
   8611       case Iop_SarN32x4: sar = True; size = 32; break;
   8612       case Iop_ShrN16x8: shr = True; size = 16; break;
   8613       case Iop_ShrN32x4: shr = True; size = 32; break;
   8614       case Iop_ShrN64x2: shr = True; size = 64; break;
   8615       default: vassert(0);
   8616    }
   8617 
   8618    if (shl || shr) {
   8619      assign( e1, amt >= size
   8620                     ? mkV128(0x0000)
   8621                     : binop(op, mkexpr(e0), mkU8(amt))
   8622      );
   8623    } else
   8624    if (sar) {
   8625      assign( e1, amt >= size
   8626                     ? binop(op, mkexpr(e0), mkU8(size-1))
   8627                     : binop(op, mkexpr(e0), mkU8(amt))
   8628      );
   8629    } else {
   8630       vassert(0);
   8631    }
   8632 
   8633    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   8634    return delta;
   8635 }
   8636 
   8637 
   8638 /* Get the current SSE rounding mode. */
   8639 
   8640 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   8641 {
   8642    return
   8643       unop( Iop_64to32,
   8644             binop( Iop_And64,
   8645                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   8646                    mkU64(3) ));
   8647 }
   8648 
   8649 static void put_sse_roundingmode ( IRExpr* sseround )
   8650 {
   8651    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   8652    stmt( IRStmt_Put( OFFB_SSEROUND,
   8653                      unop(Iop_32Uto64,sseround) ) );
   8654 }
   8655 
   8656 /* Break a 128-bit value up into four 32-bit ints. */
   8657 
   8658 static void breakup128to32s ( IRTemp t128,
   8659                               /*OUTs*/
   8660                               IRTemp* t3, IRTemp* t2,
   8661                               IRTemp* t1, IRTemp* t0 )
   8662 {
   8663    IRTemp hi64 = newTemp(Ity_I64);
   8664    IRTemp lo64 = newTemp(Ity_I64);
   8665    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   8666    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   8667 
   8668    vassert(t0 && *t0 == IRTemp_INVALID);
   8669    vassert(t1 && *t1 == IRTemp_INVALID);
   8670    vassert(t2 && *t2 == IRTemp_INVALID);
   8671    vassert(t3 && *t3 == IRTemp_INVALID);
   8672 
   8673    *t0 = newTemp(Ity_I32);
   8674    *t1 = newTemp(Ity_I32);
   8675    *t2 = newTemp(Ity_I32);
   8676    *t3 = newTemp(Ity_I32);
   8677    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   8678    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   8679    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   8680    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   8681 }
   8682 
   8683 /* Construct a 128-bit value from four 32-bit ints. */
   8684 
   8685 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   8686                               IRTemp t1, IRTemp t0 )
   8687 {
   8688    return
   8689       binop( Iop_64HLtoV128,
   8690              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   8691              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   8692    );
   8693 }
   8694 
   8695 /* Break a 64-bit value up into four 16-bit ints. */
   8696 
   8697 static void breakup64to16s ( IRTemp t64,
   8698                              /*OUTs*/
   8699                              IRTemp* t3, IRTemp* t2,
   8700                              IRTemp* t1, IRTemp* t0 )
   8701 {
   8702    IRTemp hi32 = newTemp(Ity_I32);
   8703    IRTemp lo32 = newTemp(Ity_I32);
   8704    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   8705    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   8706 
   8707    vassert(t0 && *t0 == IRTemp_INVALID);
   8708    vassert(t1 && *t1 == IRTemp_INVALID);
   8709    vassert(t2 && *t2 == IRTemp_INVALID);
   8710    vassert(t3 && *t3 == IRTemp_INVALID);
   8711 
   8712    *t0 = newTemp(Ity_I16);
   8713    *t1 = newTemp(Ity_I16);
   8714    *t2 = newTemp(Ity_I16);
   8715    *t3 = newTemp(Ity_I16);
   8716    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   8717    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   8718    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   8719    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   8720 }
   8721 
   8722 /* Construct a 64-bit value from four 16-bit ints. */
   8723 
   8724 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   8725                              IRTemp t1, IRTemp t0 )
   8726 {
   8727    return
   8728       binop( Iop_32HLto64,
   8729              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   8730              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   8731    );
   8732 }
   8733 
   8734 
   8735 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   8736    values (aa,bb), computes, for each of the 4 16-bit lanes:
   8737 
   8738    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   8739 */
   8740 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   8741 {
   8742    IRTemp aa      = newTemp(Ity_I64);
   8743    IRTemp bb      = newTemp(Ity_I64);
   8744    IRTemp aahi32s = newTemp(Ity_I64);
   8745    IRTemp aalo32s = newTemp(Ity_I64);
   8746    IRTemp bbhi32s = newTemp(Ity_I64);
   8747    IRTemp bblo32s = newTemp(Ity_I64);
   8748    IRTemp rHi     = newTemp(Ity_I64);
   8749    IRTemp rLo     = newTemp(Ity_I64);
   8750    IRTemp one32x2 = newTemp(Ity_I64);
   8751    assign(aa, aax);
   8752    assign(bb, bbx);
   8753    assign( aahi32s,
   8754            binop(Iop_SarN32x2,
   8755                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   8756                  mkU8(16) ));
   8757    assign( aalo32s,
   8758            binop(Iop_SarN32x2,
   8759                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   8760                  mkU8(16) ));
   8761    assign( bbhi32s,
   8762            binop(Iop_SarN32x2,
   8763                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   8764                  mkU8(16) ));
   8765    assign( bblo32s,
   8766            binop(Iop_SarN32x2,
   8767                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   8768                  mkU8(16) ));
   8769    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   8770    assign(
   8771       rHi,
   8772       binop(
   8773          Iop_ShrN32x2,
   8774          binop(
   8775             Iop_Add32x2,
   8776             binop(
   8777                Iop_ShrN32x2,
   8778                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   8779                mkU8(14)
   8780             ),
   8781             mkexpr(one32x2)
   8782          ),
   8783          mkU8(1)
   8784       )
   8785    );
   8786    assign(
   8787       rLo,
   8788       binop(
   8789          Iop_ShrN32x2,
   8790          binop(
   8791             Iop_Add32x2,
   8792             binop(
   8793                Iop_ShrN32x2,
   8794                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   8795                mkU8(14)
   8796             ),
   8797             mkexpr(one32x2)
   8798          ),
   8799          mkU8(1)
   8800       )
   8801    );
   8802    return
   8803       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   8804 }
   8805 
   8806 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   8807    values (aa,bb), computes, for each lane:
   8808 
   8809           if aa_lane < 0 then - bb_lane
   8810      else if aa_lane > 0 then bb_lane
   8811      else 0
   8812 */
   8813 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   8814 {
   8815    IRTemp aa       = newTemp(Ity_I64);
   8816    IRTemp bb       = newTemp(Ity_I64);
   8817    IRTemp zero     = newTemp(Ity_I64);
   8818    IRTemp bbNeg    = newTemp(Ity_I64);
   8819    IRTemp negMask  = newTemp(Ity_I64);
   8820    IRTemp posMask  = newTemp(Ity_I64);
   8821    IROp   opSub    = Iop_INVALID;
   8822    IROp   opCmpGTS = Iop_INVALID;
   8823 
   8824    switch (laneszB) {
   8825       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   8826       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   8827       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   8828       default: vassert(0);
   8829    }
   8830 
   8831    assign( aa,      aax );
   8832    assign( bb,      bbx );
   8833    assign( zero,    mkU64(0) );
   8834    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   8835    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   8836    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   8837 
   8838    return
   8839       binop(Iop_Or64,
   8840             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   8841             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   8842 
   8843 }
   8844 
   8845 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   8846    value aa, computes, for each lane
   8847 
   8848    if aa < 0 then -aa else aa
   8849 
   8850    Note that the result is interpreted as unsigned, so that the
   8851    absolute value of the most negative signed input can be
   8852    represented.
   8853 */
   8854 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   8855 {
   8856    IRTemp aa      = newTemp(Ity_I64);
   8857    IRTemp zero    = newTemp(Ity_I64);
   8858    IRTemp aaNeg   = newTemp(Ity_I64);
   8859    IRTemp negMask = newTemp(Ity_I64);
   8860    IRTemp posMask = newTemp(Ity_I64);
   8861    IROp   opSub   = Iop_INVALID;
   8862    IROp   opSarN  = Iop_INVALID;
   8863 
   8864    switch (laneszB) {
   8865       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   8866       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   8867       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   8868       default: vassert(0);
   8869    }
   8870 
   8871    assign( aa,      aax );
   8872    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   8873    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   8874    assign( zero,    mkU64(0) );
   8875    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   8876    return
   8877       binop(Iop_Or64,
   8878             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   8879             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   8880 }
   8881 
   8882 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   8883                                         IRTemp lo64, Long byteShift )
   8884 {
   8885    vassert(byteShift >= 1 && byteShift <= 7);
   8886    return
   8887       binop(Iop_Or64,
   8888             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   8889             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   8890       );
   8891 }
   8892 
   8893 /* Generate a SIGSEGV followed by a restart of the current instruction
   8894    if effective_addr is not 16-aligned.  This is required behaviour
   8895    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   8896    This assumes that guest_RIP_curr_instr is set correctly! */
   8897 /* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
   8898  * the check. Need to enable it once TSan stops generating unaligned
   8899  * accesses in the wrappers.
   8900  * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
{
   /* Emit an IR side-exit that raises SIGSEGV (with a restart at the
      current instruction) when effective_addr is not 16-aligned.
      NOTE: the mask below is 0x0 rather than 0xF, so the CmpNE64 can
      never be true and the check is effectively disabled -- this is
      deliberate; see the TODO(glider) comment immediately above this
      function.  Restore the mask to 0xF to re-enable the check. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(0x0)),
               mkU64(0)),
         Ijk_SigSEGV,
         IRConst_U64(guest_RIP_curr_instr)
      )
   );
}
   8913 
   8914 
   8915 /* Helper for deciding whether a given insn (starting at the opcode
   8916    byte) may validly be used with a LOCK prefix.  The following insns
   8917    may be used with LOCK when their destination operand is in memory.
   8918    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   8919 
   8920    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   8921    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   8922    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   8923    SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
   8924    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   8925    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   8926    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   8927 
   8928    DEC        FE /1,  FF /1
   8929    INC        FE /0,  FF /0
   8930 
   8931    NEG        F6 /3,  F7 /3
   8932    NOT        F6 /2,  F7 /2
   8933 
   8934    XCHG       86, 87
   8935 
   8936    BTC        0F BB,  0F BA /7
   8937    BTR        0F B3,  0F BA /6
   8938    BTS        0F AB,  0F BA /5
   8939 
   8940    CMPXCHG    0F B0,  0F B1
   8941    CMPXCHG8B  0F C7 /1
   8942 
   8943    XADD       0F C0,  0F C1
   8944 
   8945    ------------------------------
   8946 
   8947    80 /0  =  addb $imm8,  rm8
   8948    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   8949    82 /0  =  addb $imm8,  rm8
   8950    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   8951 
   8952    00     =  addb r8,  rm8
   8953    01     =  addl r32, rm32  and  addw r16, rm16
   8954 
   8955    Same for ADD OR ADC SBB AND SUB XOR
   8956 
   8957    FE /1  = dec rm8
   8958    FF /1  = dec rm32  and  dec rm16
   8959 
   8960    FE /0  = inc rm8
   8961    FF /0  = inc rm32  and  inc rm16
   8962 
   8963    F6 /3  = neg rm8
   8964    F7 /3  = neg rm32  and  neg rm16
   8965 
   8966    F6 /2  = not rm8
   8967    F7 /2  = not rm32  and  not rm16
   8968 
   8969    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   8970    OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
   8971 
   8972    Same for BTS, BTR
   8973 */
   8974 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   8975 {
   8976    switch (opc[0]) {
   8977       case 0x00: case 0x01: case 0x08: case 0x09:
   8978       case 0x10: case 0x11: case 0x18: case 0x19:
   8979       case 0x20: case 0x21: case 0x28: case 0x29:
   8980       case 0x30: case 0x31:
   8981          if (!epartIsReg(opc[1]))
   8982             return True;
   8983          break;
   8984 
   8985       case 0x80: case 0x81: case 0x82: case 0x83:
   8986          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   8987              && !epartIsReg(opc[1]))
   8988             return True;
   8989          break;
   8990 
   8991       case 0xFE: case 0xFF:
   8992          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   8993              && !epartIsReg(opc[1]))
   8994             return True;
   8995          break;
   8996 
   8997       case 0xF6: case 0xF7:
   8998          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   8999              && !epartIsReg(opc[1]))
   9000             return True;
   9001          break;
   9002 
   9003       case 0x86: case 0x87:
   9004          if (!epartIsReg(opc[1]))
   9005             return True;
   9006          break;
   9007 
   9008       case 0x0F: {
   9009          switch (opc[1]) {
   9010             case 0xBB: case 0xB3: case 0xAB:
   9011                if (!epartIsReg(opc[2]))
   9012                   return True;
   9013                break;
   9014             case 0xBA:
   9015                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   9016                    && !epartIsReg(opc[2]))
   9017                   return True;
   9018                break;
   9019             case 0xB0: case 0xB1:
   9020                if (!epartIsReg(opc[2]))
   9021                   return True;
   9022                break;
   9023             case 0xC7:
   9024                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   9025                   return True;
   9026                break;
   9027             case 0xC0: case 0xC1:
   9028                if (!epartIsReg(opc[2]))
   9029                   return True;
   9030                break;
   9031             default:
   9032                break;
   9033          } /* switch (opc[1]) */
   9034          break;
   9035       }
   9036 
   9037       default:
   9038          break;
   9039    } /* switch (opc[0]) */
   9040 
   9041    return False;
   9042 }
   9043 
   9044 
   9045 /*------------------------------------------------------------*/
   9046 /*--- Disassemble a single instruction                     ---*/
   9047 /*------------------------------------------------------------*/
   9048 
   9049 /* Disassemble a single instruction into IR.  The instruction is
   9050    located in host memory at &guest_code[delta]. */
   9051 
   9052 static
   9053 DisResult disInstr_AMD64_WRK (
   9054              /*OUT*/Bool* expect_CAS,
   9055              Bool         put_IP,
   9056              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   9057              Bool         resteerCisOk,
   9058              void*        callback_opaque,
   9059              Long         delta64,
   9060              VexArchInfo* archinfo,
   9061              VexAbiInfo*  vbi
   9062           )
   9063 {
   9064    IRType    ty;
   9065    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   9066    Int       alen;
   9067    UChar     opc, modrm, abyte, pre;
   9068    Long      d64;
   9069    HChar     dis_buf[50];
   9070    Int       am_sz, d_sz, n, n_prefixes;
   9071    DisResult dres;
   9072    UChar*    insn; /* used in SSE decoders */
   9073 
   9074    /* The running delta */
   9075    Long delta = delta64;
   9076 
   9077    /* Holds eip at the start of the insn, so that we can print
   9078       consistent error messages for unimplemented insns. */
   9079    Long delta_start = delta;
   9080 
   9081    /* sz denotes the nominal data-op size of the insn; we change it to
   9082       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   9083       conflict REX.W takes precedence. */
   9084    Int sz = 4;
   9085 
   9086    /* pfx holds the summary of prefixes. */
   9087    Prefix pfx = PFX_EMPTY;
   9088 
   9089    /* Set result defaults. */
   9090    dres.whatNext   = Dis_Continue;
   9091    dres.len        = 0;
   9092    dres.continueAt = 0;
   9093 
   9094    *expect_CAS = False;
   9095 
   9096    vassert(guest_RIP_next_assumed == 0);
   9097    vassert(guest_RIP_next_mustcheck == False);
   9098 
   9099    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   9100 
   9101    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   9102 
   9103    /* We may be asked to update the guest RIP before going further. */
   9104    if (put_IP)
   9105       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr)) );
   9106 
   9107    /* Spot "Special" instructions (see comment at top of file). */
   9108    {
   9109       UChar* code = (UChar*)(guest_code + delta);
   9110       /* Spot the 16-byte preamble:
   9111          48C1C703   rolq $3,  %rdi
   9112          48C1C70D   rolq $13, %rdi
   9113          48C1C73D   rolq $61, %rdi
   9114          48C1C733   rolq $51, %rdi
   9115       */
   9116       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   9117                                                && code[ 3] == 0x03 &&
   9118           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   9119                                                && code[ 7] == 0x0D &&
   9120           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   9121                                                && code[11] == 0x3D &&
   9122           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   9123                                                && code[15] == 0x33) {
   9124          /* Got a "Special" instruction preamble.  Which one is it? */
   9125          if (code[16] == 0x48 && code[17] == 0x87
   9126                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   9127             /* %RDX = client_request ( %RAX ) */
   9128             DIP("%%rdx = client_request ( %%rax )\n");
   9129             delta += 19;
   9130             jmp_lit(Ijk_ClientReq, guest_RIP_bbstart+delta);
   9131             dres.whatNext = Dis_StopHere;
   9132             goto decode_success;
   9133          }
   9134          else
   9135          if (code[16] == 0x48 && code[17] == 0x87
   9136                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   9137             /* %RAX = guest_NRADDR */
   9138             DIP("%%rax = guest_NRADDR\n");
   9139             delta += 19;
   9140             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   9141             goto decode_success;
   9142          }
   9143          else
   9144          if (code[16] == 0x48 && code[17] == 0x87
   9145                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   9146             /* call-noredir *%RAX */
   9147             DIP("call-noredir *%%rax\n");
   9148             delta += 19;
   9149             t1 = newTemp(Ity_I64);
   9150             assign(t1, getIRegRAX(8));
   9151             t2 = newTemp(Ity_I64);
   9152             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   9153             putIReg64(R_RSP, mkexpr(t2));
   9154             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   9155             jmp_treg(Ijk_NoRedir,t1);
   9156             dres.whatNext = Dis_StopHere;
   9157             goto decode_success;
   9158          }
   9159          /* We don't know what it is. */
   9160          goto decode_failure;
   9161          /*NOTREACHED*/
   9162       }
   9163    }
   9164 
   9165    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   9166       as many invalid combinations as possible. */
   9167    n_prefixes = 0;
   9168    while (True) {
   9169       if (n_prefixes > 7) goto decode_failure;
   9170       pre = getUChar(delta);
   9171       switch (pre) {
   9172          case 0x66: pfx |= PFX_66; break;
   9173          case 0x67: pfx |= PFX_ASO; break;
   9174          case 0xF2: pfx |= PFX_F2; break;
   9175          case 0xF3: pfx |= PFX_F3; break;
   9176          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   9177          case 0x2E: pfx |= PFX_CS; break;
   9178          case 0x3E: pfx |= PFX_DS; break;
   9179          case 0x26: pfx |= PFX_ES; break;
   9180          case 0x64: pfx |= PFX_FS; break;
   9181          case 0x65: pfx |= PFX_GS; break;
   9182          case 0x36: pfx |= PFX_SS; break;
   9183          case 0x40 ... 0x4F:
   9184             pfx |= PFX_REX;
   9185             if (pre & (1<<3)) pfx |= PFX_REXW;
   9186             if (pre & (1<<2)) pfx |= PFX_REXR;
   9187             if (pre & (1<<1)) pfx |= PFX_REXX;
   9188             if (pre & (1<<0)) pfx |= PFX_REXB;
   9189             break;
   9190          default:
   9191             goto not_a_prefix;
   9192       }
   9193       n_prefixes++;
   9194       delta++;
   9195    }
   9196 
   9197    not_a_prefix:
   9198 
   9199    /* Dump invalid combinations */
   9200    n = 0;
   9201    if (pfx & PFX_F2) n++;
   9202    if (pfx & PFX_F3) n++;
   9203    if (n > 1)
   9204       goto decode_failure; /* can't have both */
   9205 
   9206    n = 0;
   9207    if (pfx & PFX_CS) n++;
   9208    if (pfx & PFX_DS) n++;
   9209    if (pfx & PFX_ES) n++;
   9210    if (pfx & PFX_FS) n++;
   9211    if (pfx & PFX_GS) n++;
   9212    if (pfx & PFX_SS) n++;
   9213    if (n > 1)
   9214       goto decode_failure; /* multiple seg overrides == illegal */
   9215 
   9216    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   9217       that we should accept it. */
   9218    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
   9219       goto decode_failure;
   9220 
   9221    /* Ditto for %gs prefixes. */
   9222    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
   9223       goto decode_failure;
   9224 
   9225    /* Set up sz. */
   9226    sz = 4;
   9227    if (pfx & PFX_66) sz = 2;
   9228    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   9229 
   9230    /* Now we should be looking at the primary opcode byte or the
   9231       leading F2 or F3.  Check that any LOCK prefix is actually
   9232       allowed. */
   9233 
   9234    if (pfx & PFX_LOCK) {
   9235       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   9236          DIP("lock ");
   9237       } else {
   9238          *expect_CAS = False;
   9239          goto decode_failure;
   9240       }
   9241    }
   9242 
   9243 
   9244    /* ---------------------------------------------------- */
   9245    /* --- The SSE/SSE2 decoder.                        --- */
   9246    /* ---------------------------------------------------- */
   9247 
   9248    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   9249       previous life? */
   9250 
   9251    /* Note, this doesn't handle SSE3 right now.  All amd64s support
   9252       SSE2 as a minimum so there is no point distinguishing SSE1 vs
   9253       SSE2. */
   9254 
   9255    insn = (UChar*)&guest_code[delta];
   9256 
   9257    /* FXSAVE is spuriously at the start here only because it is
   9258       thusly placed in guest-x86/toIR.c. */
   9259 
   9260    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
   9261       Note that the presence or absence of REX.W slightly affects the
   9262       written format: whether the saved FPU IP and DP pointers are 64
   9263       or 32 bits.  But the helper function we call simply writes zero
   9264       bits in the relevant fields (which are 64 bits regardless of
   9265       what REX.W is) and so it's good enough (iow, equally broken) in
   9266       both cases. */
   9267    if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   9268        && insn[0] == 0x0F && insn[1] == 0xAE
   9269        && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 0) {
   9270        IRDirty* d;
   9271       modrm = getUChar(delta+2);
   9272       vassert(!epartIsReg(modrm));
   9273 
   9274       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   9275       delta += 2+alen;
   9276       gen_SEGV_if_not_16_aligned(addr);
   9277 
   9278       DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   9279 
   9280       /* Uses dirty helper:
   9281             void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
   9282       d = unsafeIRDirty_0_N (
   9283              0/*regparms*/,
   9284              "amd64g_dirtyhelper_FXSAVE",
   9285              &amd64g_dirtyhelper_FXSAVE,
   9286              mkIRExprVec_1( mkexpr(addr) )
   9287           );
   9288       d->needsBBP = True;
   9289 
   9290       /* declare we're writing memory */
   9291       d->mFx   = Ifx_Write;
   9292       d->mAddr = mkexpr(addr);
   9293       d->mSize = 512;
   9294 
   9295       /* declare we're reading guest state */
   9296       d->nFxState = 7;
   9297 
   9298       d->fxState[0].fx     = Ifx_Read;
   9299       d->fxState[0].offset = OFFB_FTOP;
   9300       d->fxState[0].size   = sizeof(UInt);
   9301 
   9302       d->fxState[1].fx     = Ifx_Read;
   9303       d->fxState[1].offset = OFFB_FPREGS;
   9304       d->fxState[1].size   = 8 * sizeof(ULong);
   9305 
   9306       d->fxState[2].fx     = Ifx_Read;
   9307       d->fxState[2].offset = OFFB_FPTAGS;
   9308       d->fxState[2].size   = 8 * sizeof(UChar);
   9309 
   9310       d->fxState[3].fx     = Ifx_Read;
   9311       d->fxState[3].offset = OFFB_FPROUND;
   9312       d->fxState[3].size   = sizeof(ULong);
   9313 
   9314       d->fxState[4].fx     = Ifx_Read;
   9315       d->fxState[4].offset = OFFB_FC3210;
   9316       d->fxState[4].size   = sizeof(ULong);
   9317 
   9318       d->fxState[5].fx     = Ifx_Read;
   9319       d->fxState[5].offset = OFFB_XMM0;
   9320       d->fxState[5].size   = 16 * sizeof(U128);
   9321 
   9322       d->fxState[6].fx     = Ifx_Read;
   9323       d->fxState[6].offset = OFFB_SSEROUND;
   9324       d->fxState[6].size   = sizeof(ULong);
   9325 
   9326       /* Be paranoid ... this assertion tries to ensure the 16 %xmm
   9327 	 images are packed back-to-back.  If not, the value of
   9328 	 d->fxState[5].size is wrong. */
   9329       vassert(16 == sizeof(U128));
   9330       vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
   9331 
   9332       stmt( IRStmt_Dirty(d) );
   9333 
   9334       goto decode_success;
   9335    }
   9336 
   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
      As with FXSAVE above we ignore the value of REX.W since we're
      not bothering with the FPU DP and IP fields. */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xAE
       && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 1) {
       IRDirty* d;
      modrm = getUChar(delta+2);
      vassert(!epartIsReg(modrm));

      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;
      /* The 512-byte source area must be 16-aligned; fault otherwise. */
      gen_SEGV_if_not_16_aligned(addr);

      DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);

      /* Uses dirty helper:
            VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
         NOTE:
            the VexEmWarn value is simply ignored
      */
      d = unsafeIRDirty_0_N (
             0/*regparms*/,
             "amd64g_dirtyhelper_FXRSTOR",
             &amd64g_dirtyhelper_FXRSTOR,
             mkIRExprVec_1( mkexpr(addr) )
          );
      /* Helper needs the guest state (baseblock) pointer passed in. */
      d->needsBBP = True;

      /* declare we're reading memory */
      d->mFx   = Ifx_Read;
      d->mAddr = mkexpr(addr);
      d->mSize = 512;

      /* declare we're writing guest state: the same 7 pieces of
         x87/SSE state as in the FXSAVE case above, but written
         (Ifx_Write) rather than read. */
      d->nFxState = 7;

      d->fxState[0].fx     = Ifx_Write;
      d->fxState[0].offset = OFFB_FTOP;
      d->fxState[0].size   = sizeof(UInt);

      d->fxState[1].fx     = Ifx_Write;
      d->fxState[1].offset = OFFB_FPREGS;
      d->fxState[1].size   = 8 * sizeof(ULong);

      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = OFFB_FPTAGS;
      d->fxState[2].size   = 8 * sizeof(UChar);

      d->fxState[3].fx     = Ifx_Write;
      d->fxState[3].offset = OFFB_FPROUND;
      d->fxState[3].size   = sizeof(ULong);

      d->fxState[4].fx     = Ifx_Write;
      d->fxState[4].offset = OFFB_FC3210;
      d->fxState[4].size   = sizeof(ULong);

      /* All 16 XMM registers, as one contiguous 256-byte span. */
      d->fxState[5].fx     = Ifx_Write;
      d->fxState[5].offset = OFFB_XMM0;
      d->fxState[5].size   = 16 * sizeof(U128);

      d->fxState[6].fx     = Ifx_Write;
      d->fxState[6].offset = OFFB_SSEROUND;
      d->fxState[6].size   = sizeof(ULong);

      /* Be paranoid ... this assertion tries to ensure the 16 %xmm
	 images are packed back-to-back.  If not, the value of
	 d->fxState[5].size is wrong. */
      vassert(16 == sizeof(U128));
      vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));

      stmt( IRStmt_Dirty(d) );

      goto decode_success;
   }
   9412 
   /* ------ SSE decoder main ------ */

   /* The next group of cases are simple SSE binary ops which differ
      only in opcode/prefix and the IROp applied; each is handled
      entirely by a dis_SSE_E_to_G_* / dis_SSEcmp_E_to_G helper, which
      also advances |delta| past the modRM/amode bytes. */

   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4 );
      goto decode_success;
   }

   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4 );
      goto decode_success;
   }

   /* 0F 55 = ANDNPS -- G = (not G) and E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x55) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128 );
      goto decode_success;
   }

   /* 0F 54 = ANDPS -- G = G and E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x54) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128 );
      goto decode_success;
   }

   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpps", True, 4 );
      goto decode_success;
   }

   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpss", False, 4 );
      goto decode_success;
   }
   9456 
   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F32);
      IRTemp argR = newTemp(Ity_F32);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Right operand from lane 0 of the E (source) xmm reg. */
         assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                         0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      } else {
         /* Right operand is a 32-bit load. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
         delta += 2+alen;
         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }
      /* Left operand is always lane 0 of the G (dest) xmm reg. */
      assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );

      /* Set the flags thunk to copy-through mode, with the widened
         F64 comparison result in DEP1.  Masking with 0x45 keeps only
         the bit positions of the Z, P and C flags -- presumably the
         IRCmpF64Result encoding lines up with the x86 flag layout;
         confirm against the IR definitions.  NB: the comparison does
         not distinguish the COMISS/UCOMISS QNaN-signalling difference
         here. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And64,
                      unop( Iop_32Uto64,
                            binop(Iop_CmpF64,
                                  unop(Iop_F32toF64,mkexpr(argL)),
                                  unop(Iop_F32toF64,mkexpr(argR)))),
                      mkU64(0x45)
          )));

      goto decode_success;
   }
   9496 
   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
      half xmm */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      /* Source may be an MMX register, so enter MMX mode first. */
      do_MMX_preamble();
      if (epartIsReg(modrm)) {
         assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* Convert using the prevailing SSE rounding mode. */
      assign( rmode, get_sse_roundingmode() );

      /* Lane 0 <- conversion of the low I32 of the source ... */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 0,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64to32, mkexpr(arg64)) )) );

      /* ... lane 1 <- conversion of the high I32.  Lanes 2 and 3 of
         the destination xmm are left unchanged. */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 1,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64HIto32, mkexpr(arg64)) )) );

      goto decode_success;
   }
   9537 
   /* F3 0F 2A = CVTSI2SS
      -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
      -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
   if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x2A) {

      /* Both widths round per the prevailing SSE rounding mode. */
      IRTemp rmode = newTemp(Ity_I32);
      assign( rmode, get_sse_roundingmode() );
      modrm = getUChar(delta+2);

      if (sz == 4) {
         /* 32-bit source: I32 -> F64 (exact) -> F32 (rounded). */
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
      } else {
         /* sz == 8 */
         /* 64-bit source: I64 -> F64 (rounded) -> F32 (rounded),
            hence the rounding mode is supplied to both steps. */
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
      }

      goto decode_success;
   }
   9591 
   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, according to prevailing SSE rounding mode */
   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, rounding towards zero */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp dst64  = newTemp(Ity_I64);
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      IRTemp f32hi  = newTemp(Ity_F32);
      /* 0x2C is the truncating (round-to-zero) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);

      /* Destination is an MMX register, so enter MMX mode. */
      do_MMX_preamble();
      modrm = getUChar(delta+2);

      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         /* Memory form: two consecutive F32 loads at addr and addr+4. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
                                              mkexpr(addr),
                                              mkU64(4) )));
         delta += 2+alen;
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* CVTTPS2PI always truncates; CVTPS2PI uses the current SSE
         rounding mode. */
      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Widen each F32 to F64, convert to I32, and repack the two
         results into one I64 (hi half from f32hi, lo from f32lo). */
      assign(
         dst64,
         binop( Iop_32HLto64,
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
              )
      );

      putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
      goto decode_success;
   }
   9647 
   /* F3 0F 2D = CVTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    according to prevailing SSE rounding mode
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    according to prevailing SSE rounding mode
   */
   /* F3 0F 2C = CVTTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    truncating towards zero
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    truncating towards zero
   */
   if (haveF3no66noF2(pfx)
       && insn[0] == 0x0F
       && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      /* 0x2C is the truncating (round-to-zero) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);
      vassert(sz == 4 || sz == 8);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      }

      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Widen to F64 first, then convert to the destination integer
         width.  REX.W selects the 64-bit destination form. */
      if (sz == 4) {
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI32S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      } else {
         putIReg64( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI64S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      }

      goto decode_success;
   }
   9704 
   /* Division: delegated entirely to the common E-to-G helpers. */

   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4 );
      goto decode_success;
   }
   9718 
   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 2) {

      IRTemp t64 = newTemp(Ity_I64);
      IRTemp ew = newTemp(Ity_I32);

      /* NOTE(review): the guard does not test sz, so a REX.W-prefixed
         ldmxcsr would hit this vassert rather than decode_failure --
         confirm that is the intended behaviour. */
      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;
      DIP("ldmxcsr %s\n", dis_buf);

      /* The only thing we observe in %mxcsr is the rounding mode.
         Therefore, pass the 32-bit value (SSE native-format control
         word) to a clean helper, getting back a 64-bit value, the
         lower half of which is the SSEROUND value to store, and the
         upper half of which is the emulation-warning token which may
         be generated.
      */
      /* ULong amd64h_check_ldmxcsr ( ULong ); */
      assign( t64, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_check_ldmxcsr",
                      &amd64g_check_ldmxcsr,
                      mkIRExprVec_1(
                         unop(Iop_32Uto64,
                              loadLE(Ity_I32, mkexpr(addr))
                         )
                      )
                   )
            );

      /* Low 32 bits -> new SSEROUND; high 32 bits -> warning token. */
      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
      put_emwarn( mkexpr(ew) );
      /* Finally, if an emulation warning was reported, side-exit to
         the next insn, reporting the warning, so that Valgrind's
         dispatcher sees the warning. */
      stmt(
         IRStmt_Exit(
            binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
            Ijk_EmWarn,
            IRConst_U64(guest_RIP_bbstart+delta)
         )
      );
      goto decode_success;
   }
   9767 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      Bool ok = False;
      /* Delegate to the MMX decoder, pointing it at the second opcode
         byte (0xF7); it reports success/failure through |ok|. */
      delta = dis_MMX( &ok, vbi, pfx, sz, delta+1 );
      if (!ok)
         goto decode_failure;
      goto decode_success;
   }
   9778 
   /* Max/min: delegated entirely to the common E-to-G helpers. */

   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4 );
      goto decode_success;
   }

   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4 );
      goto decode_success;
   }
   9806 
   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Register-to-register: plain 128-bit copy, no alignment
            requirement. */
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned variant (MOVAPS) enforces 16-alignment. */
         if (insn[1] == 0x28/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov[ua]ps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
   9831 
   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x29 || insn[1] == 0x11)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Register-to-register form is deliberately not handled yet:
            fall through to later decoders / decode_failure.
            awaiting test case */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned variant (MOVAPS) enforces 16-alignment. */
         if (insn[1] == 0x29/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
   9851 
   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Register form is MOVLHPS: E's low 64 bits -> G's high 64. */
         delta += 2+1;
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
         DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory form is MOVHPS: 64-bit load -> G's high 64 bits. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhps %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRexRM(pfx,modrm) ));
      }
      goto decode_success;
   }
   9874 
   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x17) {
      /* Memory destination only; the register form falls through. */
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* Store G's upper 64 bits to memory. */
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
   9892 
   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Register form is MOVHLPS: E's high 64 bits -> G's low 64. */
         delta += 2+1;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),
                          0/*lower lane*/,
                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory form is MOVLPS: 64-bit load -> G's low 64 bits. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlps %s, %s\n",
             dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
      }
      goto decode_success;
   }
   9916 
   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x13) {
      /* Memory destination only; the register form falls through. */
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* Store G's lower 64 bits to memory. */
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   0/*lower lane*/ ) );
         DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
   9934 
   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
      to 4 lowest bits of ireg(G) */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:

         4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

         20071106: Intel docs say that REX.W isn't redundant: when
         present, a 64-bit register is written; when not present, only
         the 32-bit half is written.  However, testing on a Core2
         machine suggests the entire 64 bit register is written
         irrespective of the status of REX.W.  That could be because
         of the default rule that says "if the lower half of a 32-bit
         register is written, the upper half is zeroed".  By using
         putIReg32 here we inadvertantly produce the same behaviour as
         the Core2, for the same reason -- putIReg32 implements said
         rule.

         AMD docs give no indication that REX.W is even valid for this
         insn. */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         t2 = newTemp(Ity_I32);
         t3 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* Extract the sign bit (bit 31) of lane k and park it at
            result bit k: shift right by (31-k), mask with (1<<k). */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
                            mkU32(1) ));
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
                            mkU32(2) ));
         assign( t2, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
                            mkU32(4) ));
         assign( t3, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
                            mkU32(8) ));
         /* OR the four single-bit results together. */
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32,
                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
                         )
                  );
         DIP("movmskps %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
   }
   9990 
   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   if ( ( (haveNo66noF2noF3(pfx) && sz == 4)
          || (have66noF2noF3(pfx) && sz == 2)
        )
        && insn[0] == 0x0F && insn[1] == 0x2B) {
      modrm = getUChar(delta+2);
      /* Memory destination only; the non-temporal hint is ignored. */
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         /* NOTE(review): this DIP prints the memory operand before the
            source register, whereas other store cases (e.g. movaps to
            mem) print reg,mem -- looks inconsistent; confirm intended
            trace-output order. */
         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }
   10010 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
      Intel manual does not say anything about the usual business of
      the FP reg tags getting trashed whenever an MMX insn happens.
      So we just leave them alone.
   */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE7) {
      modrm = getUChar(delta+2);
      /* Memory destination only; register form falls through. */
      if (!epartIsReg(modrm)) {
         /* do_MMX_preamble(); Intel docs don't specify this */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
         /* NOTE(review): operand order in this DIP is mem,reg; other
            store cases print reg,mem -- confirm intended. */
         DIP("movntq %s,%s\n", dis_buf,
                               nameMMXReg(gregLO3ofRM(modrm)));
         delta += 2+alen;
         goto decode_success;
      }
      /* else fall through */
   }
   10031 
   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   if (haveF3no66noF2(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x10) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg form: only the low 32-bit lane of G is written;
            the upper lanes of G are left unchanged. */
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
         DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         /* mem form: zero all of G first, then write the low lane. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          loadLE(Ity_I32, mkexpr(addr)) );
         DIP("movss %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }

   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
      or lo 1/4 xmm). */
   /* NOTE(review): unlike the 0F 10 (load) form above, this one does
      not accept a redundant REX.W (sz == 8) -- confirm intended. */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, we don't yet have a test case */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
         DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
         goto decode_success;
      }
   }
   10073 
   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4 );
      goto decode_success;
   }

   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4 );
      goto decode_success;
   }

   /* 0F 56 = ORPS -- G = G or E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128 );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE0) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgb", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE3) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgw", False );
      goto decode_success;
   }
   10114 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
      zero-extend of it in ireg(G). */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         IRTemp sV = newTemp(Ity_I64);
         t5 = newTemp(Ity_I16);
         do_MMX_preamble();
         assign(sV, getMMXReg(eregLO3ofRM(modrm)));
         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
         /* insn[3] is the imm8 lane selector; only its low 2 bits
            matter for a 4-lane source. */
         switch (insn[3] & 3) {
            case 0:  assign(t5, mkexpr(t0)); break;
            case 1:  assign(t5, mkexpr(t1)); break;
            case 2:  assign(t5, mkexpr(t2)); break;
            case 3:  assign(t5, mkexpr(t3)); break;
            default: vassert(0);
         }
         /* REX.W selects a 64-bit destination register, otherwise
            the 32-bit register is written. */
         if (sz == 8)
            putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
         else
            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameMMXReg(eregLO3ofRM(modrm)),
                           sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                                 : nameIReg32(gregOfRexRM(pfx,modrm))
         );
         delta += 4;
         goto decode_success;
      }
      /* else fall through */
      /* note, for anyone filling in the mem case: this insn has one
         byte after the amode and therefore you must pass 1 as the
         last arg to disAMode */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of mmx(G). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC4) {
      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
         mmx reg.  t4 is the new lane value.  t5 is the original
         mmx value. t6 is the new mmx value. */
      Int lane;
      t4 = newTemp(Ity_I16);
      t5 = newTemp(Ity_I64);
      t6 = newTemp(Ity_I64);
      modrm = insn[2];
      do_MMX_preamble();

      assign(t5, getMMXReg(gregLO3ofRM(modrm)));
      breakup64to16s( t5, &t3, &t2, &t1, &t0 );

      if (epartIsReg(modrm)) {
         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
         delta += 3+1;
         /* the imm8 lane selector is the byte after the modrm */
         lane = insn[3+1-1];
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg16(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         delta += 3+alen;
         /* the imm8 lane selector is the byte after the amode */
         lane = insn[3+alen-1];
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Reassemble the 64-bit value with t4 replacing the selected
         lane; only the low 2 bits of the imm8 are used. */
      switch (lane & 3) {
         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
         default: vassert(0);
      }
      putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
      goto decode_success;
   }
   10198 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EE = PMAXSW -- 16x4 signed max */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DE = PMAXUB -- 8x8 unsigned max */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxub", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EA = PMINSW -- 16x4 signed min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DA = PMINUB -- 8x8 unsigned min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminub", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
      mmx(G), turn them into a byte, and put zero-extend of it in
      ireg(G). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         do_MMX_preamble();
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         assign(t0, getMMXReg(eregLO3ofRM(modrm)));
         /* The lane-mask computation is delegated to a clean helper
            rather than expressed in IR. */
         assign(t1, mkIRExprCCall(
                       Ity_I64, 0/*regparms*/,
                       "amd64g_calculate_mmx_pmovmskb",
                       &amd64g_calculate_mmx_pmovmskb,
                       mkIRExprVec_1(mkexpr(t0))));
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
         DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   /* NOTE(review): Intel's mnemonic for 0F E4 is PMULHUW; "pmuluh"
      is the name this decoder prints in the DIP below. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE4) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmuluh", False );
      goto decode_success;
   }
   10274 
   10275    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   10276    /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   10277    /* 0F 18 /2 = PREFETCH1 */
   10278    /* 0F 18 /3 = PREFETCH2 */
   10279    if (insn[0] == 0x0F && insn[1] == 0x18
   10280        && haveNo66noF2noF3(pfx)
   10281        && !epartIsReg(insn[2])
   10282        && gregLO3ofRM(insn[2]) >= 0 && gregLO3ofRM(insn[2]) <= 3) {
   10283       HChar* hintstr = "??";
   10284 
   10285       modrm = getUChar(delta+2);
   10286       vassert(!epartIsReg(modrm));
   10287 
   10288       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10289       delta += 2+alen;
   10290 
   10291       switch (gregLO3ofRM(modrm)) {
   10292          case 0: hintstr = "nta"; break;
   10293          case 1: hintstr = "t0"; break;
   10294          case 2: hintstr = "t1"; break;
   10295          case 3: hintstr = "t2"; break;
   10296          default: vassert(0);
   10297       }
   10298 
   10299       DIP("prefetch%s %s\n", hintstr, dis_buf);
   10300       goto decode_success;
   10301    }
   10302 
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF6) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "psadbw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_I64);
      dV = newTemp(Ity_I64);
      do_MMX_preamble();
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         order = (Int)insn[3];
         delta += 2+2;
         DIP("pshufw $%d,%s,%s\n", order,
                                   nameMMXReg(eregLO3ofRM(modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*extra byte after amode*/ );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         order = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("pshufw $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }
      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
      /* Each 2-bit field of the imm8 ("order") picks the source lane
         for one result lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
   10352 
   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rcpps", Iop_Recip32Fx4 );
      goto decode_success;
   }

   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rcpss", Iop_Recip32F0x4 );
      goto decode_success;
   }

   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rsqrtps", Iop_RSqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rsqrtss", Iop_RSqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   /* Modelled as a full IR memory fence. */
   if (haveNo66noF2noF3(pfx)
       && insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7
       && sz == 4) {
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("sfence\n");
      goto decode_success;
   }
   10397 
   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];
         delta += 2+2;
         DIP("shufps $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte at end of insn*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("shufps $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* The two low result lanes are chosen from the destination,
         the two high ones from the source; each 2-bit imm8 field
         selects one lane. */
#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
                       SELD((select>>2)&3), SELD((select>>0)&3) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
   10445 
   /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "sqrtps", Iop_Sqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "sqrtss", Iop_Sqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 3) {

      /* NOTE(review): sz is asserted rather than being part of the
         decode condition; an unexpected operand size would hit the
         vassert rather than fall through to another decoder. */
      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* Fake up a native SSE mxcsr word.  The only thing it depends
         on is SSEROUND[1:0], so call a clean helper to cook it up.
      */
      /* ULong amd64h_create_mxcsr ( ULong sseround ) */
      DIP("stmxcsr %s\n", dis_buf);
      storeLE(
         mkexpr(addr),
         unop(Iop_64to32,
              mkIRExprCCall(
                 Ity_I64, 0/*regp*/,
                 "amd64g_create_mxcsr", &amd64g_create_mxcsr,
                 mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
	      )
	 )
      );
      goto decode_success;
   }
   10488 
   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4 );
      goto decode_success;
   }

   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   /* These just appear to be special cases of SHUFPS */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      Bool hi = toBool(insn[1] == 0x15);
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Interleave the high (resp. low) two lanes of source and
         destination, source lanes landing in the odd positions. */
      if (hi) {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s3, d3, s2, d2 ) );
      } else {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s1, d1, s0, d0 ) );
      }

      goto decode_success;
   }
   10543 
   /* 0F 57 = XORPS -- G = G xor E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128 );
      goto decode_success;
   }

   /* ---------------------------------------------------- */
   /* --- end of the SSE decoder.                      --- */
   /* ---------------------------------------------------- */

   /* ---------------------------------------------------- */
   /* --- start of the SSE2 decoder.                   --- */
   /* ---------------------------------------------------- */
   10558 
   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   /* (sz == 2 below means the 66 operand-size prefix was seen) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2 );
      goto decode_success;
   }

   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x58) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2 );
      goto decode_success;
   }

   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x55) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F 54 = ANDPD -- G = G and E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x54) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmppd", True, 8 );
      goto decode_success;
   }

   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC2) {
      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpsd", False, 8 );
      goto decode_success;
   }
   10602 
   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F64);
      IRTemp argR = newTemp(Ity_F64);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                         0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }
      assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );

      /* Set the flags thunk to OP_COPY: the CmpF64 result, masked
         with 0x45 so that only the Z, C and P flag bit positions
         survive, is stored directly as DEP1. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And64,
                      unop( Iop_32Uto64,
                            binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                      mkU64(0x45)
          )));

      goto decode_success;
   }
   10640 
   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
      F64 in xmm(G) */
   if (haveF3no66noF2(pfx) && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp arg64 = newTemp(Ity_I64);
      if (sz != 4) goto decode_failure;

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
         delta += 2+1;
         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32 -> F64 is exact, so no rounding mode is supplied. */
      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
      );

      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
      );

      goto decode_success;
   }

   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
      xmm(G) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32 -> F32 may lose precision, so each lane is widened to
         F64 exactly and then narrowed using the current SSE
         rounding mode. */
      assign( rmode, get_sse_roundingmode() );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             unop(Iop_I32StoF64,mkexpr(_t)))

      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10711 
   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), and zero upper half, rounding towards zero */
   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), according to prevailing rounding mode, and zero
      upper half */
   if ( ( (haveF2no66noF3(pfx) && sz == 4)
          || (have66noF2noF3(pfx) && sz == 2)
        )
        && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp argV   = newTemp(Ity_V128);
      IRTemp rmode  = newTemp(Ity_I32);
      Bool   r2zero = toBool(sz == 2);   /* 66-prefixed form truncates */

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* Truncating (cvtt) variant ignores MXCSR and rounds to zero. */
      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Reinterpret the two 64-bit halves of the source as F64s. */
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toI32S,                   \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Two I32 results land in the lo half of G; hi half is zeroed. */
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10767 
   10768    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   10769       I32 in mmx, according to prevailing SSE rounding mode */
   10770    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   10771       I32 in mmx, rounding towards zero */
   10772    if (have66noF2noF3(pfx) && sz == 2
   10773        && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   10774       IRTemp dst64  = newTemp(Ity_I64);
   10775       IRTemp rmode  = newTemp(Ity_I32);
   10776       IRTemp f64lo  = newTemp(Ity_F64);
   10777       IRTemp f64hi  = newTemp(Ity_F64);
   10778       Bool   r2zero = toBool(insn[1] == 0x2C);
   10779 
   10780       do_MMX_preamble();
   10781       modrm = getUChar(delta+2);
   10782 
   10783       if (epartIsReg(modrm)) {
   10784          delta += 2+1;
   10785          assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10786          assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   10787          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   10788                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
   10789                                    nameMMXReg(gregLO3ofRM(modrm)));
   10790       } else {
   10791          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   10792          assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10793          assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   10794                                               mkexpr(addr),
   10795                                               mkU64(8) )));
   10796          delta += 2+alen;
   10797          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   10798                                    dis_buf,
   10799                                    nameMMXReg(gregLO3ofRM(modrm)));
   10800       }
   10801 
   10802       if (r2zero) {
   10803          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10804       } else {
   10805          assign( rmode, get_sse_roundingmode() );
   10806       }
   10807 
   10808       assign(
   10809          dst64,
   10810          binop( Iop_32HLto64,
   10811                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   10812                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   10813               )
   10814       );
   10815 
   10816       putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   10817       goto decode_success;
   10818    }
   10819 
   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
      lo half xmm(G), rounding according to prevailing SSE rounding
      mode, and zero upper half */
   /* Note, this is practically identical to CVTPD2DQ.  It would have
      been nicer to merge them together, but the insn[] offsets differ
      by one. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpd2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* Narrow each half of the source, viewed as F64, to F32. */
      assign( rmode, get_sse_roundingmode() );
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Results go to the low half of G; the upper half is zeroed. */
      putXMMRegLane32(  gregOfRexRM(pfx,modrm), 3, mkU32(0) );
      putXMMRegLane32(  gregOfRexRM(pfx,modrm), 2, mkU32(0) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10866 
   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
      xmm(G) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Only switch to MMX mode if the source is a MMX register.
            This is inconsistent with all other instructions which
            convert between XMM and (M64 or MMX), which always switch
            to MMX mode even if 64-bit operand is M64 and not MMX.  At
            least, that's what the Intel docs seem to me to say.
            Fixes #210264. */
         do_MMX_preamble();
         assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* Low I32 of the source -> F64 in lane 0; high I32 -> lane 1.
         I32 -> F64 is always exact, so no rounding mode is needed. */
      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
      );

      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
      );

      goto decode_success;
   }
   10906 
   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G), rounding towards zero */
   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G), as per the prevailing rounding mode */
   if ( ( (have66noF2noF3(pfx) && sz == 2)
          || (haveF3no66noF2(pfx) && sz == 4)
        )
        && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV   = newTemp(Ity_V128);
      IRTemp rmode  = newTemp(Ity_I32);
      Bool   r2zero = toBool(sz == 4);   /* F3-prefixed form truncates */

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtps2dq %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* Truncating (cvtt) variant ignores MXCSR and rounds to zero. */
      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* This is less than ideal.  If it turns out to be a performance
         bottleneck it can be improved. */
#     define CVT(_t)                             \
         binop( Iop_F64toI32S,                   \
                mkexpr(rmode),                   \
                unop( Iop_F32toF64,              \
                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

      putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
   10958 
   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
      F64 in xmm(G). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);
      IRTemp f32hi = newTemp(Ity_F32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0) );
         assign( f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1) );
         delta += 2+1;
         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
         assign( f32hi, loadLE(Ity_F32,
                               binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
         delta += 2+alen;
         DIP("cvtps2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* F32 -> F64 widening is exact, so no rounding mode is needed. */
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 1,
                        unop(Iop_F32toF64, mkexpr(f32hi)) );
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                        unop(Iop_F32toF64, mkexpr(f32lo)) );

      goto decode_success;
   }
   10990 
   /* F2 0F 2D = CVTSD2SI
      when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                    according to prevailing SSE rounding mode
      when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                    according to prevailing SSE rounding mode
   */
   /* F2 0F 2C = CVTTSD2SI
      when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                    truncating towards zero
      when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                    truncating towards zero
   */
   if (haveF2no66noF3(pfx)
       && insn[0] == 0x0F
       && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f64lo  = newTemp(Ity_F64);
      Bool   r2zero = toBool(insn[1] == 0x2C);  /* 0x2C = truncating form */
      vassert(sz == 4 || sz == 8);   /* sz==8 iff REX.W present */

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      }

      /* Truncating variant rounds towards zero regardless of MXCSR. */
      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Operand size selects the destination integer width. */
      if (sz == 4) {
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
      } else {
         putIReg64( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
      }

      goto decode_success;
   }
   11043 
   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
      low 1/4 xmm(G), according to prevailing SSE rounding mode */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp rmode = newTemp(Ity_I32);
      IRTemp f64lo = newTemp(Ity_F64);
      vassert(sz == 4);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvtsd2ss %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Narrow F64 -> F32 per MXCSR; only lane 0 of G is written. */
      assign( rmode, get_sse_roundingmode() );
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 0,
         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
      );

      goto decode_success;
   }
   11074 
   /* F2 0F 2A = CVTSI2SD
      when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
      when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
   */
   if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      modrm = getUChar(delta+2);

      if (sz == 4) {
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2sd %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I32 -> F64 is always exact, hence no rounding mode here. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop(Iop_I32StoF64, mkexpr(arg32))
         );
      } else {
         /* sz == 8 */
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I64 -> F64 can be inexact, so the SSE rounding mode is
            supplied for this variant. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm),
            0,
            binop( Iop_I64StoF64,
                   get_sse_roundingmode(),
                   mkexpr(arg64)
            )
         );

      }

      goto decode_success;
   }
   11128 
   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
      low half xmm(G) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvtss2sd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* F32 -> F64 widening is exact; only lane pair 0 is written. */
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                        unop( Iop_F32toF64, mkexpr(f32lo) ) );

      goto decode_success;
   }
   11154 
   /* The packed/scalar FP arithmetic cases below delegate to the
      common SSE E-to-G helpers, which also advance delta past the
      modrm/amode bytes and emit the DIP line. */

   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x5E) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2 );
      goto decode_success;
   }

   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2])
       && (gregLO3ofRM(insn[2]) == 5 || gregLO3ofRM(insn[2]) == 6)) {
      delta += 3;   /* 0F AE modrm */
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("%sfence\n", gregLO3ofRM(insn[2])==5 ? "l" : "m");
      goto decode_success;
   }

   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2 );
      goto decode_success;
   }

   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2 );
      goto decode_success;
   }
   11210 
   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F
       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
      HChar* wot = insn[1]==0x28 ? "apd" :
                   insn[1]==0x10 ? "upd" : "dqa";
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned forms (movapd, movdqa) fault on a
            misaligned memory operand; movupd does not. */
         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov%s %s,%s\n", wot, dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
   11239 
   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F
       && (insn[1] == 0x29 || insn[1] == 0x11)) {
      HChar* wot = insn[1]==0x29 ? "apd" : "upd";
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg( gregOfRexRM(pfx,modrm) ) );
         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned form (movapd) faults on misalignment. */
         if (insn[1] == 0x29/*movapd*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf );
         delta += 2+alen;
      }
      goto decode_success;
   }
   11263 
   /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4, zeroing high 3/4 of xmm. */
   /*              or from ireg64/m64 to xmm lo 1/2, zeroing high 1/2 of xmm. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x6E) {
      /* sz==2 comes from the 66 prefix; it really means a 32-bit
         operand here.  sz==8 means REX.W was present (movq form). */
      vassert(sz == 2 || sz == 8);
      if (sz == 2) sz = 4;
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         if (sz == 4) {
            /* Iop_32UtoV128 zero-extends, clearing the upper 96 bits. */
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            /* Iop_64UtoV128 zero-extends, clearing the upper 64 bits. */
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMReg(
            gregOfRexRM(pfx,modrm),
            sz == 4
               ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
               :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
         );
         DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      goto decode_success;
   }
   11301 
   /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   /*              or from xmm low 1/2 to ireg64 or m64. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x7E) {
      /* 66 prefix gives sz==2 but the operand is really 32 bits;
         sz==8 means REX.W was present (movq form). */
      if (sz == 2) sz = 4;
      vassert(sz == 4 || sz == 8);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         if (sz == 4) {
            putIReg32( eregOfRexRM(pfx,modrm),
                       getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
            DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 nameIReg32(eregOfRexRM(pfx,modrm)));
         } else {
            putIReg64( eregOfRexRM(pfx,modrm),
                       getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
            DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 nameIReg64(eregOfRexRM(pfx,modrm)));
         }
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         storeLE( mkexpr(addr),
                  sz == 4
                     ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
                     : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
         DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
                               nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
   11333 
   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x7F) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                nameXMMReg(eregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* movdqa is the aligned form: fault on misaligned stores. */
         gen_SEGV_if_not_16_aligned( addr );
         delta += 2+alen;
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
   11353 
   11354    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   11355    if (haveF3no66noF2(pfx) && sz == 4
   11356        && insn[0] == 0x0F && insn[1] == 0x6F) {
   11357       modrm = getUChar(delta+2);
   11358       if (epartIsReg(modrm)) {
   11359          putXMMReg( gregOfRexRM(pfx,modrm),
   11360                     getXMMReg( eregOfRexRM(pfx,modrm) ));
   11361          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11362                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   11363          delta += 2+1;
   11364       } else {
   11365          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11366          putXMMReg( gregOfRexRM(pfx,modrm),
   11367                     loadLE(Ity_V128, mkexpr(addr)) );
   11368          DIP("movdqu %s,%s\n", dis_buf,
   11369                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   11370          delta += 2+alen;
   11371       }
   11372       goto decode_success;
   11373    }
   11374 
   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x7F) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg form deliberately rejected until a test case is
            available; the code below it is currently unreachable. */
         goto decode_failure; /* awaiting test case */
         delta += 2+1;
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                nameXMMReg(eregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         /* Unaligned form: no 16-byte alignment check on the store. */
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
   11394 
   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* touching MMX state requires the x87/MMX mode switch first */
         do_MMX_preamble();
         putMMXReg( gregLO3ofRM(modrm),
                    getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameMMXReg(gregLO3ofRM(modrm)));
         delta += 2+1;
         goto decode_success;
      } else {
         /* apparently no mem case for this insn */
         goto decode_failure;
      }
   }
   11412 
   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   /* These seems identical to MOVHPS.  This instruction encoding is
      completely crazy. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         /* 64-bit load into bits 127:64 of G; bits 63:0 unchanged */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhpd %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRexRM(pfx,modrm) ));
         goto decode_success;
      }
   }
   11430 
   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   /* Again, this seems identical to MOVHPS. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x17) {
      if (!epartIsReg(insn[2])) {
         /* store bits 127:64 of G to memory; no reg-reg form */
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
   11447 
   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   /* Identical to MOVLPS ? */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         /* 64-bit load into bits 63:0 of G; bits 127:64 unchanged */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),
                          0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlpd %s, %s\n",
             dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
         goto decode_success;
      }
   }
   11465 
   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   /* Identical to MOVLPS ? */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x13) {
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         /* store bits 63:0 of G to memory; no reg-reg form */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                   0/*lower lane*/ ) );
         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
   11482 
   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
      2 lowest bits of ireg(G) */
   if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:
         66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
         20071106: see further comments on MOVMSKPS implementation above.
      */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* 32-bit lane 1 holds bits 63:32, ie the sign bit of the low
            F64; shift it down to result bit 0 */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
                            mkU32(1) ));
         /* lane 3 holds bits 127:96, ie the sign bit of the high F64;
            shift by 30 so it lands at result bit 1, then mask it out */
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
                            mkU32(2) ));
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
                  );
         DIP("movmskpd %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
      goto decode_failure;
   }
   11515 
   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         IRTemp regD    = newTemp(Ity_V128);
         IRTemp mask    = newTemp(Ity_V128);
         IRTemp olddata = newTemp(Ity_V128);
         IRTemp newdata = newTemp(Ity_V128);
                addr    = newTemp(Ity_I64);

         /* implicit destination is [RDI], after segment/address-size
            overrides */
         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regD, getXMMReg( gregOfRexRM(pfx,modrm) ));

         /* Build a per-byte mask by replicating each byte's top bit
            (arithmetic shift right by 7) across the byte, one 64-bit
            half at a time. */
         /* Unfortunately can't do the obvious thing with SarN8x16
            here since that can't be re-emitted as SSE2 code - no such
            insn. */
         assign(
            mask,
            binop(Iop_64HLtoV128,
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
                        mkU8(7) ),
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
                        mkU8(7) ) ));
         /* NOTE(review): modelled as a read-modify-write of all 16
            bytes (load old data, blend through mask, store back),
            rather than writing only the selected bytes */
         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_OrV128,
                       binop(Iop_AndV128,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_AndV128,
                             mkexpr(olddata),
                             unop(Iop_NotV128, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );

         delta += 2+1;
         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRexRM(pfx,modrm) ),
                                   nameXMMReg( gregOfRexRM(pfx,modrm) ) );
         goto decode_success;
      }
      /* else fall through */
   }
   11560 
   11561    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   11562    if (have66noF2noF3(pfx) && sz == 2
   11563        && insn[0] == 0x0F && insn[1] == 0xE7) {
   11564       modrm = getUChar(delta+2);
   11565       if (!epartIsReg(modrm)) {
   11566          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11567          gen_SEGV_if_not_16_aligned( addr );
   11568          storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11569          DIP("movntdq %s,%s\n", dis_buf,
   11570                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   11571          delta += 2+alen;
   11572          goto decode_success;
   11573       }
   11574       /* else fall through */
   11575       goto decode_failure;
   11576    }
   11577 
   11578    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   11579    if (haveNo66noF2noF3(pfx) &&
   11580        insn[0] == 0x0F && insn[1] == 0xC3) {
   11581       vassert(sz == 4 || sz == 8);
   11582       modrm = getUChar(delta+2);
   11583       if (!epartIsReg(modrm)) {
   11584          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11585          storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
   11586          DIP("movnti %s,%s\n", dis_buf,
   11587                                nameIRegG(sz, pfx, modrm));
   11588          delta += 2+alen;
   11589          goto decode_success;
   11590       }
   11591       /* else fall through */
   11592    }
   11593 
   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm).  */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, awaiting test case */
         /* dst: lo half copied, hi half zeroed */
      } else {
         /* store bits 63:0 of G to memory */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
   11612 
   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
      hi half). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* touching MMX state requires the x87/MMX mode switch first */
         do_MMX_preamble();
         /* Iop_64UtoV128 zero-extends, so bits 127:64 of G become 0 */
         putXMMReg( gregOfRexRM(pfx,modrm),
                    unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
         DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
         goto decode_success;
      } else {
         /* apparently no mem case for this insn */
         goto decode_failure;
      }
   }
   11631 
   11632    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   11633       G (lo half xmm).  Upper half of G is zeroed out. */
   11634    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   11635       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   11636       If E is reg, upper half of G is unchanged. */
   11637    if ( (haveF2no66noF3(pfx)
   11638          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   11639          && insn[0] == 0x0F && insn[1] == 0x10)
   11640         ||
   11641         (haveF3no66noF2(pfx)
   11642          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   11643          && insn[0] == 0x0F && insn[1] == 0x7E)
   11644       ) {
   11645       modrm = getUChar(delta+2);
   11646       if (epartIsReg(modrm)) {
   11647          putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11648                           getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11649          if (insn[1] == 0x7E/*MOVQ*/) {
   11650             /* zero bits 127:64 */
   11651             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   11652          }
   11653          DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11654                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   11655          delta += 2+1;
   11656       } else {
   11657          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11658          putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11659          putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11660                           loadLE(Ity_I64, mkexpr(addr)) );
   11661          DIP("movsd %s,%s\n", dis_buf,
   11662                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   11663          delta += 2+alen;
   11664       }
   11665       goto decode_success;
   11666    }
   11667 
   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm). */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg: only bits 63:0 of E are written */
         putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              nameXMMReg(eregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         /* reg-to-mem: 64-bit store of the low lane */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
      }
      goto decode_success;
   }
   11690 
   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2 );
      goto decode_success;
   }

   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2 );
      goto decode_success;
   }

   /* 66 0F 56 = ORPD -- G = G or E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128 );
      goto decode_success;
   }
   11713 
   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV = newTemp(Ity_V128);
      IRTemp dV = newTemp(Ity_V128);
      IRTemp s1 = newTemp(Ity_I64);
      IRTemp s0 = newTemp(Ity_I64);
      IRTemp d1 = newTemp(Ity_I64);
      IRTemp d0 = newTemp(Ity_I64);

      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];
         delta += 2+2;
         DIP("shufpd $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* immediate byte follows the amode */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("shufpd $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* split both operands into their 64-bit halves */
      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );

#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
#     define SELS(n) mkexpr((n)==0 ? s0 : s1)

      /* imm bit 0 picks the dest half for the result's low 64 bits;
         imm bit 1 picks the source half for the result's high 64 bits */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
   11763 
   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "sqrtpd", Iop_Sqrt64Fx2 );
      goto decode_success;
   }

   /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x51) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta+2,
                                         "sqrtsd", Iop_Sqrt64F0x2 );
      goto decode_success;
   }

   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2 );
      goto decode_success;
   }
   11794 
   11795    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   11796    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   11797    /* These just appear to be special cases of SHUFPS */
   11798    if (have66noF2noF3(pfx)
   11799        && sz == 2 /* could be 8 if rex also present */
   11800        && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   11801       IRTemp s1 = newTemp(Ity_I64);
   11802       IRTemp s0 = newTemp(Ity_I64);
   11803       IRTemp d1 = newTemp(Ity_I64);
   11804       IRTemp d0 = newTemp(Ity_I64);
   11805       IRTemp sV = newTemp(Ity_V128);
   11806       IRTemp dV = newTemp(Ity_V128);
   11807       Bool   hi = toBool(insn[1] == 0x15);
   11808 
   11809       modrm = insn[2];
   11810       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   11811 
   11812       if (epartIsReg(modrm)) {
   11813          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   11814          delta += 2+1;
   11815          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11816                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   11817                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11818       } else {
   11819          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   11820          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11821          delta += 2+alen;
   11822          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11823                                   dis_buf,
   11824                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11825       }
   11826 
   11827       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11828       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11829       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11830       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11831 
   11832       if (hi) {
   11833          putXMMReg( gregOfRexRM(pfx,modrm),
   11834                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   11835       } else {
   11836          putXMMReg( gregOfRexRM(pfx,modrm),
   11837                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   11838       }
   11839 
   11840       goto decode_success;
   11841    }
   11842 
   /* 66 0F 57 = XORPD -- G = G xor E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128 );
      goto decode_success;
   }

   /* -- SSE2 saturating pack-narrow operations -- */

   /* 66 0F 6B = PACKSSDW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6B) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packssdw",
                                 Iop_QNarrowBin32Sto16Sx8, True );
      goto decode_success;
   }

   /* 66 0F 63 = PACKSSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x63) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packsswb",
                                 Iop_QNarrowBin16Sto8Sx16, True );
      goto decode_success;
   }

   /* 66 0F 67 = PACKUSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x67) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packuswb",
                                 Iop_QNarrowBin16Sto8Ux16, True );
      goto decode_success;
   }
   11876 
   /* -- SSE2 packed integer adds: wrapping (PADDx), signed saturating
      (PADDSx) and unsigned saturating (PADDUSx) -- */

   /* 66 0F FC = PADDB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddb", Iop_Add8x16, False );
      goto decode_success;
   }

   /* 66 0F FE = PADDD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddd", Iop_Add32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F D4 = PADDQ -- add 64x1 */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                vbi, pfx, delta+2, insn[1], "paddq", False );
      goto decode_success;
   }

   /* 66 0F D4 = PADDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddq", Iop_Add64x2, False );
      goto decode_success;
   }

   /* 66 0F FD = PADDW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddw", Iop_Add16x8, False );
      goto decode_success;
   }

   /* 66 0F EC = PADDSB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsb", Iop_QAdd8Sx16, False );
      goto decode_success;
   }

   /* 66 0F ED = PADDSW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xED) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsw", Iop_QAdd16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DC = PADDUSB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusb", Iop_QAdd8Ux16, False );
      goto decode_success;
   }

   /* 66 0F DD = PADDUSW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusw", Iop_QAdd16Ux8, False );
      goto decode_success;
   }
   11950 
   /* 66 0F DB = PAND */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDB) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F DF = PANDN */
   /* G = (not G) and E, hence the _invG helper with a plain AND */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDF) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F E0 = PAVGB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE0) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pavgb", Iop_Avg8Ux16, False );
      goto decode_success;
   }

   /* 66 0F E3 = PAVGW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE3) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pavgw", Iop_Avg16Ux8, False );
      goto decode_success;
   }
   11980 
   /* -- SSE2 packed integer compares: equality (PCMPEQx) and signed
      greater-than (PCMPGTx) -- */

   /* 66 0F 74 = PCMPEQB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x74) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqb", Iop_CmpEQ8x16, False );
      goto decode_success;
   }

   /* 66 0F 76 = PCMPEQD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x76) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqd", Iop_CmpEQ32x4, False );
      goto decode_success;
   }

   /* 66 0F 75 = PCMPEQW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x75) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqw", Iop_CmpEQ16x8, False );
      goto decode_success;
   }

   /* 66 0F 64 = PCMPGTB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x64) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
      goto decode_success;
   }

   /* 66 0F 66 = PCMPGTD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x66) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
      goto decode_success;
   }

   /* 66 0F 65 = PCMPGTW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x65) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
      goto decode_success;
   }
   12028 
   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
      zero-extend of it in ireg(G). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         t5 = newTemp(Ity_V128);
         t4 = newTemp(Ity_I16);
         assign(t5, getXMMReg(eregOfRexRM(pfx,modrm)));
         /* split E into four 32-bit pieces (t3:t2:t1:t0, high to low) */
         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
         /* imm & 7 selects one of the eight 16-bit lanes */
         switch (insn[3] & 7) {
            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
            default: vassert(0);
         }
         /* zero-extend the selected lane into the 32-bit ireg */
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t4)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameXMMReg(eregOfRexRM(pfx,modrm)),
                           nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 4;
         goto decode_success;
      }
      /* else fall through */
      /* note, if memory case is ever filled in, there is 1 byte after
         amode */
   }
   12062 
   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of xmm(G). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC4) {
      Int lane;
      t4 = newTemp(Ity_I16);
      modrm = insn[2];

      if (epartIsReg(modrm)) {
         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
         delta += 3+1;
         /* lane number is the immediate byte after the modrm */
         lane = insn[3+1-1];
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg16(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         delta += 3+alen;
         /* lane number is the immediate byte after the amode */
         lane = insn[3+alen-1];
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
     }

      /* only the low 3 bits of the immediate select the lane */
      putXMMRegLane16( gregOfRexRM(pfx,modrm), lane & 7, mkexpr(t4) );
      goto decode_success;
   }
   12093 
   12094    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   12095       E(xmm or mem) to G(xmm) */
   12096    if (have66noF2noF3(pfx) && sz == 2
   12097        && insn[0] == 0x0F && insn[1] == 0xF5) {
   12098       IRTemp s1V  = newTemp(Ity_V128);
   12099       IRTemp s2V  = newTemp(Ity_V128);
   12100       IRTemp dV   = newTemp(Ity_V128);
   12101       IRTemp s1Hi = newTemp(Ity_I64);
   12102       IRTemp s1Lo = newTemp(Ity_I64);
   12103       IRTemp s2Hi = newTemp(Ity_I64);
   12104       IRTemp s2Lo = newTemp(Ity_I64);
   12105       IRTemp dHi  = newTemp(Ity_I64);
   12106       IRTemp dLo  = newTemp(Ity_I64);
   12107       modrm = insn[2];
   12108       if (epartIsReg(modrm)) {
   12109          assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12110          delta += 2+1;
   12111          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12112                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12113       } else {
   12114          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12115          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   12116          delta += 2+alen;
   12117          DIP("pmaddwd %s,%s\n", dis_buf,
   12118                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12119       }
   12120       assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
   12121       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   12122       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   12123       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   12124       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   12125       assign( dHi, mkIRExprCCall(
   12126                       Ity_I64, 0/*regparms*/,
   12127                       "amd64g_calculate_mmx_pmaddwd",
   12128                       &amd64g_calculate_mmx_pmaddwd,
   12129                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   12130                    ));
   12131       assign( dLo, mkIRExprCCall(
   12132                       Ity_I64, 0/*regparms*/,
   12133                       "amd64g_calculate_mmx_pmaddwd",
   12134                       &amd64g_calculate_mmx_pmaddwd,
   12135                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   12136                    ));
   12137       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   12138       putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
   12139       goto decode_success;
   12140    }
   12141 
   12142    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   12143    if (have66noF2noF3(pfx) && sz == 2
   12144        && insn[0] == 0x0F && insn[1] == 0xEE) {
   12145       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12146                                  "pmaxsw", Iop_Max16Sx8, False );
   12147       goto decode_success;
   12148    }
   12149 
   12150    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   12151    if (have66noF2noF3(pfx) && sz == 2
   12152        && insn[0] == 0x0F && insn[1] == 0xDE) {
   12153       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12154                                  "pmaxub", Iop_Max8Ux16, False );
   12155       goto decode_success;
   12156    }
   12157 
   12158    /* 66 0F EA = PMINSW -- 16x8 signed min */
   12159    if (have66noF2noF3(pfx) && sz == 2
   12160        && insn[0] == 0x0F && insn[1] == 0xEA) {
   12161       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12162                                  "pminsw", Iop_Min16Sx8, False );
   12163       goto decode_success;
   12164    }
   12165 
   12166    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   12167    if (have66noF2noF3(pfx) && sz == 2
   12168        && insn[0] == 0x0F && insn[1] == 0xDA) {
   12169       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12170                                  "pminub", Iop_Min8Ux16, False );
   12171       goto decode_success;
   12172    }
   12173 
   12174    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
   12175       xmm(E), turn them into a byte, and put zero-extend of it in
   12176       ireg(G).  Doing this directly is just too cumbersome; give up
   12177       therefore and call a helper. */
   12178    /* ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   12179    if (have66noF2noF3(pfx)
   12180        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
   12181        && insn[0] == 0x0F && insn[1] == 0xD7) {
   12182       modrm = insn[2];
   12183       if (epartIsReg(modrm)) {
   12184          t0 = newTemp(Ity_I64);
   12185          t1 = newTemp(Ity_I64);
   12186          assign(t0, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0));
   12187          assign(t1, getXMMRegLane64(eregOfRexRM(pfx,modrm), 1));
   12188          t5 = newTemp(Ity_I64);
   12189          assign(t5, mkIRExprCCall(
   12190                        Ity_I64, 0/*regparms*/,
   12191                        "amd64g_calculate_sse_pmovmskb",
   12192                        &amd64g_calculate_sse_pmovmskb,
   12193                        mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
   12194          putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t5)));
   12195          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12196                                  nameIReg32(gregOfRexRM(pfx,modrm)));
   12197          delta += 3;
   12198          goto decode_success;
   12199       }
   12200       /* else fall through */
   12201    }
   12202 
   12203    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   12204    if (have66noF2noF3(pfx) && sz == 2
   12205        && insn[0] == 0x0F && insn[1] == 0xE4) {
   12206       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12207                                  "pmulhuw", Iop_MulHi16Ux8, False );
   12208       goto decode_success;
   12209    }
   12210 
   12211    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   12212    if (have66noF2noF3(pfx) && sz == 2
   12213        && insn[0] == 0x0F && insn[1] == 0xE5) {
   12214       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12215                                  "pmulhw", Iop_MulHi16Sx8, False );
   12216       goto decode_success;
   12217    }
   12218 
   12219    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   12220    if (have66noF2noF3(pfx) && sz == 2
   12221        && insn[0] == 0x0F && insn[1] == 0xD5) {
   12222       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12223                                  "pmullw", Iop_Mul16x8, False );
   12224       goto decode_success;
   12225    }
   12226 
   12227    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   12228    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   12229       0 to form 64-bit result */
   12230    if (haveNo66noF2noF3(pfx) && sz == 4
   12231        && insn[0] == 0x0F && insn[1] == 0xF4) {
   12232       IRTemp sV = newTemp(Ity_I64);
   12233       IRTemp dV = newTemp(Ity_I64);
   12234       t1 = newTemp(Ity_I32);
   12235       t0 = newTemp(Ity_I32);
   12236       modrm = insn[2];
   12237 
   12238       do_MMX_preamble();
   12239       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   12240 
   12241       if (epartIsReg(modrm)) {
   12242          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   12243          delta += 2+1;
   12244          DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   12245                                 nameMMXReg(gregLO3ofRM(modrm)));
   12246       } else {
   12247          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12248          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12249          delta += 2+alen;
   12250          DIP("pmuludq %s,%s\n", dis_buf,
   12251                                 nameMMXReg(gregLO3ofRM(modrm)));
   12252       }
   12253 
   12254       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   12255       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   12256       putMMXReg( gregLO3ofRM(modrm),
   12257                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   12258       goto decode_success;
   12259    }
   12260 
   12261    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   12262       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   12263       half */
   12264    /* This is a really poor translation -- could be improved if
   12265       performance critical */
   12266    if (have66noF2noF3(pfx) && sz == 2
   12267        && insn[0] == 0x0F && insn[1] == 0xF4) {
   12268       IRTemp sV, dV;
   12269       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   12270       sV = newTemp(Ity_V128);
   12271       dV = newTemp(Ity_V128);
   12272       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   12273       t1 = newTemp(Ity_I64);
   12274       t0 = newTemp(Ity_I64);
   12275       modrm = insn[2];
   12276       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   12277 
   12278       if (epartIsReg(modrm)) {
   12279          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12280          delta += 2+1;
   12281          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12282                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12283       } else {
   12284          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12285          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12286          delta += 2+alen;
   12287          DIP("pmuludq %s,%s\n", dis_buf,
   12288                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12289       }
   12290 
   12291       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   12292       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   12293 
   12294       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   12295       putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
   12296       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   12297       putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
   12298       goto decode_success;
   12299    }
   12300 
   12301    /* 66 0F EB = POR */
   12302    if (have66noF2noF3(pfx) && sz == 2
   12303        && insn[0] == 0x0F && insn[1] == 0xEB) {
   12304       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128 );
   12305       goto decode_success;
   12306    }
   12307 
   12308    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   12309       from E(xmm or mem) to G(xmm) */
   12310    if (have66noF2noF3(pfx) && sz == 2
   12311        && insn[0] == 0x0F && insn[1] == 0xF6) {
   12312       IRTemp s1V  = newTemp(Ity_V128);
   12313       IRTemp s2V  = newTemp(Ity_V128);
   12314       IRTemp dV   = newTemp(Ity_V128);
   12315       IRTemp s1Hi = newTemp(Ity_I64);
   12316       IRTemp s1Lo = newTemp(Ity_I64);
   12317       IRTemp s2Hi = newTemp(Ity_I64);
   12318       IRTemp s2Lo = newTemp(Ity_I64);
   12319       IRTemp dHi  = newTemp(Ity_I64);
   12320       IRTemp dLo  = newTemp(Ity_I64);
   12321       modrm = insn[2];
   12322       if (epartIsReg(modrm)) {
   12323          assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12324          delta += 2+1;
   12325          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12326                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   12327       } else {
   12328          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12329          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   12330          delta += 2+alen;
   12331          DIP("psadbw %s,%s\n", dis_buf,
   12332                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   12333       }
   12334       assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
   12335       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   12336       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   12337       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   12338       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   12339       assign( dHi, mkIRExprCCall(
   12340                       Ity_I64, 0/*regparms*/,
   12341                       "amd64g_calculate_mmx_psadbw",
   12342                       &amd64g_calculate_mmx_psadbw,
   12343                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   12344                    ));
   12345       assign( dLo, mkIRExprCCall(
   12346                       Ity_I64, 0/*regparms*/,
   12347                       "amd64g_calculate_mmx_psadbw",
   12348                       &amd64g_calculate_mmx_psadbw,
   12349                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   12350                    ));
   12351       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   12352       putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
   12353       goto decode_success;
   12354    }
   12355 
   12356    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   12357    if (have66noF2noF3(pfx) && sz == 2
   12358        && insn[0] == 0x0F && insn[1] == 0x70) {
   12359       Int order;
   12360       IRTemp sV, dV, s3, s2, s1, s0;
   12361       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12362       sV = newTemp(Ity_V128);
   12363       dV = newTemp(Ity_V128);
   12364       modrm = insn[2];
   12365       if (epartIsReg(modrm)) {
   12366          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12367          order = (Int)insn[3];
   12368          delta += 3+1;
   12369          DIP("pshufd $%d,%s,%s\n", order,
   12370                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
   12371                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   12372       } else {
   12373          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
   12374                            1/*byte after the amode*/ );
   12375          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12376 	 order = (Int)insn[2+alen];
   12377          delta += 2+alen+1;
   12378          DIP("pshufd $%d,%s,%s\n", order,
   12379                                    dis_buf,
   12380                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   12381       }
   12382       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   12383 
   12384 #     define SEL(n) \
   12385                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   12386       assign(dV,
   12387 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   12388                            SEL((order>>2)&3), SEL((order>>0)&3) )
   12389       );
   12390       putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
   12391 #     undef SEL
   12392       goto decode_success;
   12393    }
   12394 
   12395    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   12396       mem) to G(xmm), and copy lower half */
   12397    if (haveF3no66noF2(pfx) && sz == 4
   12398        && insn[0] == 0x0F && insn[1] == 0x70) {
   12399       Int order;
   12400       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   12401       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12402       sV   = newTemp(Ity_V128);
   12403       dV   = newTemp(Ity_V128);
   12404       sVhi = newTemp(Ity_I64);
   12405       dVhi = newTemp(Ity_I64);
   12406       modrm = insn[2];
   12407       if (epartIsReg(modrm)) {
   12408          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12409          order = (Int)insn[3];
   12410          delta += 3+1;
   12411          DIP("pshufhw $%d,%s,%s\n", order,
   12412                                     nameXMMReg(eregOfRexRM(pfx,modrm)),
   12413                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12414       } else {
   12415          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
   12416                            1/*byte after the amode*/ );
   12417          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12418 	 order = (Int)insn[2+alen];
   12419          delta += 2+alen+1;
   12420          DIP("pshufhw $%d,%s,%s\n", order,
   12421                                     dis_buf,
   12422                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12423       }
   12424       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12425       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   12426 
   12427 #     define SEL(n) \
   12428                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   12429       assign(dVhi,
   12430 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   12431                           SEL((order>>2)&3), SEL((order>>0)&3) )
   12432       );
   12433       assign(dV, binop( Iop_64HLtoV128,
   12434                         mkexpr(dVhi),
   12435                         unop(Iop_V128to64, mkexpr(sV))) );
   12436       putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
   12437 #     undef SEL
   12438       goto decode_success;
   12439    }
   12440 
   12441    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   12442       mem) to G(xmm), and copy upper half */
   12443    if (haveF2no66noF3(pfx) && sz == 4
   12444        && insn[0] == 0x0F && insn[1] == 0x70) {
   12445       Int order;
   12446       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   12447       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12448       sV   = newTemp(Ity_V128);
   12449       dV   = newTemp(Ity_V128);
   12450       sVlo = newTemp(Ity_I64);
   12451       dVlo = newTemp(Ity_I64);
   12452       modrm = insn[2];
   12453       if (epartIsReg(modrm)) {
   12454          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   12455          order = (Int)insn[3];
   12456          delta += 3+1;
   12457          DIP("pshuflw $%d,%s,%s\n", order,
   12458                                     nameXMMReg(eregOfRexRM(pfx,modrm)),
   12459                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12460       } else {
   12461          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
   12462                            1/*byte after the amode*/ );
   12463          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12464 	 order = (Int)insn[2+alen];
   12465          delta += 2+alen+1;
   12466          DIP("pshuflw $%d,%s,%s\n", order,
   12467                                     dis_buf,
   12468                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12469       }
   12470       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   12471       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   12472 
   12473 #     define SEL(n) \
   12474                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   12475       assign(dVlo,
   12476 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   12477                           SEL((order>>2)&3), SEL((order>>0)&3) )
   12478       );
   12479       assign(dV, binop( Iop_64HLtoV128,
   12480                         unop(Iop_V128HIto64, mkexpr(sV)),
   12481                         mkexpr(dVlo) ) );
   12482       putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
   12483 #     undef SEL
   12484       goto decode_success;
   12485    }
   12486 
   12487    /* 66 0F 72 /6 ib = PSLLD by immediate */
   12488    if (have66noF2noF3(pfx) && sz == 2
   12489        && insn[0] == 0x0F && insn[1] == 0x72
   12490        && epartIsReg(insn[2])
   12491        && gregLO3ofRM(insn[2]) == 6) {
   12492       delta = dis_SSE_shiftE_imm( pfx, delta+2, "pslld", Iop_ShlN32x4 );
   12493       goto decode_success;
   12494    }
   12495 
   12496    /* 66 0F F2 = PSLLD by E */
   12497    if (have66noF2noF3(pfx) && sz == 2
   12498        && insn[0] == 0x0F && insn[1] == 0xF2) {
   12499       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "pslld", Iop_ShlN32x4 );
   12500       goto decode_success;
   12501    }
   12502 
   12503    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   12504    /* note, if mem case ever filled in, 1 byte after amode */
   12505    if (have66noF2noF3(pfx) && sz == 2
   12506        && insn[0] == 0x0F && insn[1] == 0x73
   12507        && epartIsReg(insn[2])
   12508        && gregLO3ofRM(insn[2]) == 7) {
   12509       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   12510       Int    imm = (Int)insn[3];
   12511       Int    reg = eregOfRexRM(pfx,insn[2]);
   12512       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   12513       vassert(imm >= 0 && imm <= 255);
   12514       delta += 4;
   12515 
   12516       sV    = newTemp(Ity_V128);
   12517       dV    = newTemp(Ity_V128);
   12518       hi64  = newTemp(Ity_I64);
   12519       lo64  = newTemp(Ity_I64);
   12520       hi64r = newTemp(Ity_I64);
   12521       lo64r = newTemp(Ity_I64);
   12522 
   12523       if (imm >= 16) {
   12524          putXMMReg(reg, mkV128(0x0000));
   12525          goto decode_success;
   12526       }
   12527 
   12528       assign( sV, getXMMReg(reg) );
   12529       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   12530       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   12531 
   12532       if (imm == 0) {
   12533          assign( lo64r, mkexpr(lo64) );
   12534          assign( hi64r, mkexpr(hi64) );
   12535       }
   12536       else
   12537       if (imm == 8) {
   12538          assign( lo64r, mkU64(0) );
   12539          assign( hi64r, mkexpr(lo64) );
   12540       }
   12541       else
   12542       if (imm > 8) {
   12543          assign( lo64r, mkU64(0) );
   12544          assign( hi64r, binop( Iop_Shl64,
   12545                                mkexpr(lo64),
   12546                                mkU8( 8*(imm-8) ) ));
   12547       } else {
   12548          assign( lo64r, binop( Iop_Shl64,
   12549                                mkexpr(lo64),
   12550                                mkU8(8 * imm) ));
   12551          assign( hi64r,
   12552                  binop( Iop_Or64,
   12553                         binop(Iop_Shl64, mkexpr(hi64),
   12554                                          mkU8(8 * imm)),
   12555                         binop(Iop_Shr64, mkexpr(lo64),
   12556                                          mkU8(8 * (8 - imm)) )
   12557                       )
   12558                );
   12559       }
   12560       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   12561       putXMMReg(reg, mkexpr(dV));
   12562       goto decode_success;
   12563    }
   12564 
   12565    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   12566    if (have66noF2noF3(pfx) && sz == 2
   12567        && insn[0] == 0x0F && insn[1] == 0x73
   12568        && epartIsReg(insn[2])
   12569        && gregLO3ofRM(insn[2]) == 6) {
   12570       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllq", Iop_ShlN64x2 );
   12571       goto decode_success;
   12572    }
   12573 
   12574    /* 66 0F F3 = PSLLQ by E */
   12575    if (have66noF2noF3(pfx) && sz == 2
   12576        && insn[0] == 0x0F && insn[1] == 0xF3) {
   12577       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllq", Iop_ShlN64x2 );
   12578       goto decode_success;
   12579    }
   12580 
   12581    /* 66 0F 71 /6 ib = PSLLW by immediate */
   12582    if (have66noF2noF3(pfx) && sz == 2
   12583        && insn[0] == 0x0F && insn[1] == 0x71
   12584        && epartIsReg(insn[2])
   12585        && gregLO3ofRM(insn[2]) == 6) {
   12586       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllw", Iop_ShlN16x8 );
   12587       goto decode_success;
   12588    }
   12589 
   12590    /* 66 0F F1 = PSLLW by E */
   12591    if (have66noF2noF3(pfx) && sz == 2
   12592        && insn[0] == 0x0F && insn[1] == 0xF1) {
   12593       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllw", Iop_ShlN16x8 );
   12594       goto decode_success;
   12595    }
   12596 
   12597    /* 66 0F 72 /4 ib = PSRAD by immediate */
   12598    if (have66noF2noF3(pfx) && sz == 2
   12599        && insn[0] == 0x0F && insn[1] == 0x72
   12600        && epartIsReg(insn[2])
   12601        && gregLO3ofRM(insn[2]) == 4) {
   12602       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrad", Iop_SarN32x4 );
   12603       goto decode_success;
   12604    }
   12605 
   12606    /* 66 0F E2 = PSRAD by E */
   12607    if (have66noF2noF3(pfx) && sz == 2
   12608        && insn[0] == 0x0F && insn[1] == 0xE2) {
   12609       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrad", Iop_SarN32x4 );
   12610       goto decode_success;
   12611    }
   12612 
   12613    /* 66 0F 71 /4 ib = PSRAW by immediate */
   12614    if (have66noF2noF3(pfx) && sz == 2
   12615        && insn[0] == 0x0F && insn[1] == 0x71
   12616        && epartIsReg(insn[2])
   12617        && gregLO3ofRM(insn[2]) == 4) {
   12618       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psraw", Iop_SarN16x8 );
   12619       goto decode_success;
   12620    }
   12621 
   12622    /* 66 0F E1 = PSRAW by E */
   12623    if (have66noF2noF3(pfx) && sz == 2
   12624        && insn[0] == 0x0F && insn[1] == 0xE1) {
   12625       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psraw", Iop_SarN16x8 );
   12626       goto decode_success;
   12627    }
   12628 
   12629    /* 66 0F 72 /2 ib = PSRLD by immediate */
   12630    if (have66noF2noF3(pfx) && sz == 2
   12631        && insn[0] == 0x0F && insn[1] == 0x72
   12632        && epartIsReg(insn[2])
   12633        && gregLO3ofRM(insn[2]) == 2) {
   12634       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrld", Iop_ShrN32x4 );
   12635       goto decode_success;
   12636    }
   12637 
   12638    /* 66 0F D2 = PSRLD by E */
   12639    if (have66noF2noF3(pfx) && sz == 2
   12640        && insn[0] == 0x0F && insn[1] == 0xD2) {
   12641       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrld", Iop_ShrN32x4 );
   12642       goto decode_success;
   12643    }
   12644 
   12645    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   12646    /* note, if mem case ever filled in, 1 byte after amode */
   12647    if (have66noF2noF3(pfx) && sz == 2
   12648        && insn[0] == 0x0F && insn[1] == 0x73
   12649        && epartIsReg(insn[2])
   12650        && gregLO3ofRM(insn[2]) == 3) {
   12651       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   12652       Int    imm = (Int)insn[3];
   12653       Int    reg = eregOfRexRM(pfx,insn[2]);
   12654       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   12655       vassert(imm >= 0 && imm <= 255);
   12656       delta += 4;
   12657 
   12658       sV    = newTemp(Ity_V128);
   12659       dV    = newTemp(Ity_V128);
   12660       hi64  = newTemp(Ity_I64);
   12661       lo64  = newTemp(Ity_I64);
   12662       hi64r = newTemp(Ity_I64);
   12663       lo64r = newTemp(Ity_I64);
   12664 
   12665       if (imm >= 16) {
   12666          putXMMReg(reg, mkV128(0x0000));
   12667          goto decode_success;
   12668       }
   12669 
   12670       assign( sV, getXMMReg(reg) );
   12671       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   12672       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   12673 
   12674       if (imm == 0) {
   12675          assign( lo64r, mkexpr(lo64) );
   12676          assign( hi64r, mkexpr(hi64) );
   12677       }
   12678       else
   12679       if (imm == 8) {
   12680          assign( hi64r, mkU64(0) );
   12681          assign( lo64r, mkexpr(hi64) );
   12682       }
   12683       else
   12684       if (imm > 8) {
   12685          assign( hi64r, mkU64(0) );
   12686          assign( lo64r, binop( Iop_Shr64,
   12687                                mkexpr(hi64),
   12688                                mkU8( 8*(imm-8) ) ));
   12689       } else {
   12690          assign( hi64r, binop( Iop_Shr64,
   12691                                mkexpr(hi64),
   12692                                mkU8(8 * imm) ));
   12693          assign( lo64r,
   12694                  binop( Iop_Or64,
   12695                         binop(Iop_Shr64, mkexpr(lo64),
   12696                                          mkU8(8 * imm)),
   12697                         binop(Iop_Shl64, mkexpr(hi64),
   12698                                          mkU8(8 * (8 - imm)) )
   12699                       )
   12700                );
   12701       }
   12702 
   12703       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   12704       putXMMReg(reg, mkexpr(dV));
   12705       goto decode_success;
   12706    }
   12707 
   12708    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   12709    if (have66noF2noF3(pfx) && sz == 2
   12710        && insn[0] == 0x0F && insn[1] == 0x73
   12711        && epartIsReg(insn[2])
   12712        && gregLO3ofRM(insn[2]) == 2) {
   12713       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlq", Iop_ShrN64x2 );
   12714       goto decode_success;
   12715    }
   12716 
   12717    /* 66 0F D3 = PSRLQ by E */
   12718    if (have66noF2noF3(pfx) && sz == 2
   12719        && insn[0] == 0x0F && insn[1] == 0xD3) {
   12720       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlq", Iop_ShrN64x2 );
   12721       goto decode_success;
   12722    }
   12723 
   12724    /* 66 0F 71 /2 ib = PSRLW by immediate */
   12725    if (have66noF2noF3(pfx) && sz == 2
   12726        && insn[0] == 0x0F && insn[1] == 0x71
   12727        && epartIsReg(insn[2])
   12728        && gregLO3ofRM(insn[2]) == 2) {
   12729       delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlw", Iop_ShrN16x8 );
   12730       goto decode_success;
   12731    }
   12732 
   12733    /* 66 0F D1 = PSRLW by E */
   12734    if (have66noF2noF3(pfx) && sz == 2
   12735        && insn[0] == 0x0F && insn[1] == 0xD1) {
   12736       delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlw", Iop_ShrN16x8 );
   12737       goto decode_success;
   12738    }
   12739 
   12740    /* 66 0F F8 = PSUBB */
   12741    if (have66noF2noF3(pfx) && sz == 2
   12742        && insn[0] == 0x0F && insn[1] == 0xF8) {
   12743       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12744                                  "psubb", Iop_Sub8x16, False );
   12745       goto decode_success;
   12746    }
   12747 
   12748    /* 66 0F FA = PSUBD */
   12749    if (have66noF2noF3(pfx) && sz == 2
   12750        && insn[0] == 0x0F && insn[1] == 0xFA) {
   12751       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12752                                  "psubd", Iop_Sub32x4, False );
   12753       goto decode_success;
   12754    }
   12755 
   12756    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   12757    /* 0F FB = PSUBQ -- sub 64x1 */
   12758    if (haveNo66noF2noF3(pfx) && sz == 4
   12759        && insn[0] == 0x0F && insn[1] == 0xFB) {
   12760       do_MMX_preamble();
   12761       delta = dis_MMXop_regmem_to_reg (
   12762                 vbi, pfx, delta+2, insn[1], "psubq", False );
   12763       goto decode_success;
   12764    }
   12765 
   12766    /* 66 0F FB = PSUBQ */
   12767    if (have66noF2noF3(pfx) && sz == 2
   12768        && insn[0] == 0x0F && insn[1] == 0xFB) {
   12769       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12770                                  "psubq", Iop_Sub64x2, False );
   12771       goto decode_success;
   12772    }
   12773 
   12774    /* 66 0F F9 = PSUBW */
   12775    if (have66noF2noF3(pfx) && sz == 2
   12776        && insn[0] == 0x0F && insn[1] == 0xF9) {
   12777       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12778                                  "psubw", Iop_Sub16x8, False );
   12779       goto decode_success;
   12780    }
   12781 
   12782    /* 66 0F E8 = PSUBSB */
   12783    if (have66noF2noF3(pfx) && sz == 2
   12784        && insn[0] == 0x0F && insn[1] == 0xE8) {
   12785       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12786                                  "psubsb", Iop_QSub8Sx16, False );
   12787       goto decode_success;
   12788    }
   12789 
   12790    /* 66 0F E9 = PSUBSW */
   12791    if (have66noF2noF3(pfx) && sz == 2
   12792        && insn[0] == 0x0F && insn[1] == 0xE9) {
   12793       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12794                                  "psubsw", Iop_QSub16Sx8, False );
   12795       goto decode_success;
   12796    }
   12797 
    12798    /* 66 0F D8 = PSUBUSB */
   12799    if (have66noF2noF3(pfx) && sz == 2
   12800        && insn[0] == 0x0F && insn[1] == 0xD8) {
   12801       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12802                                  "psubusb", Iop_QSub8Ux16, False );
   12803       goto decode_success;
   12804    }
   12805 
    12806    /* 66 0F D9 = PSUBUSW */
   12807    if (have66noF2noF3(pfx) && sz == 2
   12808        && insn[0] == 0x0F && insn[1] == 0xD9) {
   12809       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12810                                  "psubusw", Iop_QSub16Ux8, False );
   12811       goto decode_success;
   12812    }
   12813 
   12814    /* 66 0F 68 = PUNPCKHBW */
   12815    if (have66noF2noF3(pfx) && sz == 2
   12816        && insn[0] == 0x0F && insn[1] == 0x68) {
   12817       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12818                                  "punpckhbw",
   12819                                  Iop_InterleaveHI8x16, True );
   12820       goto decode_success;
   12821    }
   12822 
   12823    /* 66 0F 6A = PUNPCKHDQ */
   12824    if (have66noF2noF3(pfx) && sz == 2
   12825        && insn[0] == 0x0F && insn[1] == 0x6A) {
   12826       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12827                                  "punpckhdq",
   12828                                  Iop_InterleaveHI32x4, True );
   12829       goto decode_success;
   12830    }
   12831 
   12832    /* 66 0F 6D = PUNPCKHQDQ */
   12833    if (have66noF2noF3(pfx) && sz == 2
   12834        && insn[0] == 0x0F && insn[1] == 0x6D) {
   12835       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12836                                  "punpckhqdq",
   12837                                  Iop_InterleaveHI64x2, True );
   12838       goto decode_success;
   12839    }
   12840 
   12841    /* 66 0F 69 = PUNPCKHWD */
   12842    if (have66noF2noF3(pfx) && sz == 2
   12843        && insn[0] == 0x0F && insn[1] == 0x69) {
   12844       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12845                                  "punpckhwd",
   12846                                  Iop_InterleaveHI16x8, True );
   12847       goto decode_success;
   12848    }
   12849 
   12850    /* 66 0F 60 = PUNPCKLBW */
   12851    if (have66noF2noF3(pfx) && sz == 2
   12852        && insn[0] == 0x0F && insn[1] == 0x60) {
   12853       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12854                                  "punpcklbw",
   12855                                  Iop_InterleaveLO8x16, True );
   12856       goto decode_success;
   12857    }
   12858 
   12859    /* 66 0F 62 = PUNPCKLDQ */
   12860    if (have66noF2noF3(pfx) && sz == 2
   12861        && insn[0] == 0x0F && insn[1] == 0x62) {
   12862       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12863                                  "punpckldq",
   12864                                  Iop_InterleaveLO32x4, True );
   12865       goto decode_success;
   12866    }
   12867 
   12868    /* 66 0F 6C = PUNPCKLQDQ */
   12869    if (have66noF2noF3(pfx) && sz == 2
   12870        && insn[0] == 0x0F && insn[1] == 0x6C) {
   12871       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12872                                  "punpcklqdq",
   12873                                  Iop_InterleaveLO64x2, True );
   12874       goto decode_success;
   12875    }
   12876 
   12877    /* 66 0F 61 = PUNPCKLWD */
   12878    if (have66noF2noF3(pfx) && sz == 2
   12879        && insn[0] == 0x0F && insn[1] == 0x61) {
   12880       delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
   12881                                  "punpcklwd",
   12882                                  Iop_InterleaveLO16x8, True );
   12883       goto decode_success;
   12884    }
   12885 
   12886    /* 66 0F EF = PXOR */
   12887    if (have66noF2noF3(pfx) && sz == 2
   12888        && insn[0] == 0x0F && insn[1] == 0xEF) {
   12889       delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128 );
   12890       goto decode_success;
   12891    }
   12892 
   12893 //.. //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   12894 //.. //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   12895 //.. //--        && (!epartIsReg(insn[2]))
   12896 //.. //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   12897 //.. //--       Bool store = gregOfRM(insn[2]) == 0;
   12898 //.. //--       vg_assert(sz == 4);
   12899 //.. //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   12900 //.. //--       t1   = LOW24(pair);
   12901 //.. //--       eip += 2+HI8(pair);
   12902 //.. //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   12903 //.. //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   12904 //.. //--                   Lit16, (UShort)insn[2],
   12905 //.. //--                   TempReg, t1 );
   12906 //.. //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   12907 //.. //--       goto decode_success;
   12908 //.. //--    }
   12909 
   12910    /* 0F AE /7 = CLFLUSH -- flush cache line */
   12911    if (haveNo66noF2noF3(pfx) && sz == 4
   12912        && insn[0] == 0x0F && insn[1] == 0xAE
   12913        && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7) {
   12914 
   12915       /* This is something of a hack.  We need to know the size of the
   12916          cache line containing addr.  Since we don't (easily), assume
   12917          256 on the basis that no real cache would have a line that
   12918          big.  It's safe to invalidate more stuff than we need, just
   12919          inefficient. */
   12920       ULong lineszB = 256ULL;
   12921 
   12922       addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12923       delta += 2+alen;
   12924 
   12925       /* Round addr down to the start of the containing block. */
   12926       stmt( IRStmt_Put(
   12927                OFFB_TISTART,
   12928                binop( Iop_And64,
   12929                       mkexpr(addr),
   12930                       mkU64( ~(lineszB-1) ))) );
   12931 
   12932       stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );
   12933 
   12934       irsb->jumpkind = Ijk_TInval;
   12935       irsb->next     = mkU64(guest_RIP_bbstart+delta);
   12936       dres.whatNext  = Dis_StopHere;
   12937 
   12938       DIP("clflush %s\n", dis_buf);
   12939       goto decode_success;
   12940    }
   12941 
   12942    /* ---------------------------------------------------- */
   12943    /* --- end of the SSE/SSE2 decoder.                 --- */
   12944    /* ---------------------------------------------------- */
   12945 
   12946    /* ---------------------------------------------------- */
   12947    /* --- start of the SSE3 decoder.                   --- */
   12948    /* ---------------------------------------------------- */
   12949 
   12950    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   12951       duplicating some lanes (2:2:0:0). */
   12952    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   12953       duplicating some lanes (3:3:1:1). */
   12954    if (haveF3no66noF2(pfx) && sz == 4
   12955        && insn[0] == 0x0F && (insn[1] == 0x12 || insn[1] == 0x16)) {
   12956       IRTemp s3, s2, s1, s0;
   12957       IRTemp sV  = newTemp(Ity_V128);
   12958       Bool   isH = insn[1] == 0x16;
   12959       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   12960 
   12961       modrm = insn[2];
   12962       if (epartIsReg(modrm)) {
   12963          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12964          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   12965                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   12966                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12967          delta += 2+1;
   12968       } else {
   12969          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   12970          gen_SEGV_if_not_16_aligned( addr );
   12971          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12972          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   12973 	     dis_buf,
   12974              nameXMMReg(gregOfRexRM(pfx,modrm)));
   12975          delta += 2+alen;
   12976       }
   12977 
   12978       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   12979       putXMMReg( gregOfRexRM(pfx,modrm),
   12980                  isH ? mk128from32s( s3, s3, s1, s1 )
   12981                      : mk128from32s( s2, s2, s0, s0 ) );
   12982       goto decode_success;
   12983    }
   12984 
   12985    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
    12986       duplicating some lanes (1:0:1:0). */
   12987    if (haveF2no66noF3(pfx)
   12988        && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
   12989        && insn[0] == 0x0F && insn[1] == 0x12) {
   12990       IRTemp sV = newTemp(Ity_V128);
   12991       IRTemp d0 = newTemp(Ity_I64);
   12992 
   12993       modrm = insn[2];
   12994       if (epartIsReg(modrm)) {
   12995          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   12996          DIP("movddup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12997                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   12998          delta += 2+1;
   12999          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   13000       } else {
   13001          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13002          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   13003          DIP("movddup %s,%s\n", dis_buf,
   13004                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   13005          delta += 2+alen;
   13006       }
   13007 
   13008       putXMMReg( gregOfRexRM(pfx,modrm),
   13009                  binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   13010       goto decode_success;
   13011    }
   13012 
   13013    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   13014    if (haveF2no66noF3(pfx) && sz == 4
   13015        && insn[0] == 0x0F && insn[1] == 0xD0) {
   13016       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   13017       IRTemp eV   = newTemp(Ity_V128);
   13018       IRTemp gV   = newTemp(Ity_V128);
   13019       IRTemp addV = newTemp(Ity_V128);
   13020       IRTemp subV = newTemp(Ity_V128);
   13021       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   13022 
   13023       modrm = insn[2];
   13024       if (epartIsReg(modrm)) {
   13025          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13026          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13027                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13028          delta += 2+1;
   13029       } else {
   13030          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13031          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13032          DIP("addsubps %s,%s\n", dis_buf,
   13033                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13034          delta += 2+alen;
   13035       }
   13036 
   13037       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13038 
   13039       assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
   13040       assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
   13041 
   13042       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   13043       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   13044 
   13045       putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( a3, s2, a1, s0 ));
   13046       goto decode_success;
   13047    }
   13048 
    13049    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   13050    if (have66noF2noF3(pfx) && sz == 2
   13051        && insn[0] == 0x0F && insn[1] == 0xD0) {
   13052       IRTemp eV   = newTemp(Ity_V128);
   13053       IRTemp gV   = newTemp(Ity_V128);
   13054       IRTemp addV = newTemp(Ity_V128);
   13055       IRTemp subV = newTemp(Ity_V128);
   13056       IRTemp a1     = newTemp(Ity_I64);
   13057       IRTemp s0     = newTemp(Ity_I64);
   13058 
   13059       modrm = insn[2];
   13060       if (epartIsReg(modrm)) {
   13061          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13062          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13063                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13064          delta += 2+1;
   13065       } else {
   13066          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13067          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13068          DIP("addsubpd %s,%s\n", dis_buf,
   13069                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13070          delta += 2+alen;
   13071       }
   13072 
   13073       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13074 
   13075       assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
   13076       assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
   13077 
   13078       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   13079       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   13080 
   13081       putXMMReg( gregOfRexRM(pfx,modrm),
   13082                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   13083       goto decode_success;
   13084    }
   13085 
   13086    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   13087    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   13088    if (haveF2no66noF3(pfx) && sz == 4
   13089        && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   13090       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   13091       IRTemp eV     = newTemp(Ity_V128);
   13092       IRTemp gV     = newTemp(Ity_V128);
   13093       IRTemp leftV  = newTemp(Ity_V128);
   13094       IRTemp rightV = newTemp(Ity_V128);
   13095       Bool   isAdd  = insn[1] == 0x7C;
   13096       HChar* str    = isAdd ? "add" : "sub";
   13097       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   13098 
   13099       modrm = insn[2];
   13100       if (epartIsReg(modrm)) {
   13101          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13102          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13103                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13104          delta += 2+1;
   13105       } else {
   13106          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13107          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13108          DIP("h%sps %s,%s\n", str, dis_buf,
   13109                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13110          delta += 2+alen;
   13111       }
   13112 
   13113       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13114 
   13115       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   13116       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   13117 
   13118       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   13119       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   13120 
   13121       putXMMReg( gregOfRexRM(pfx,modrm),
   13122                  binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   13123                        mkexpr(leftV), mkexpr(rightV) ) );
   13124       goto decode_success;
   13125    }
   13126 
   13127    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   13128    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   13129    if (have66noF2noF3(pfx) && sz == 2
   13130        && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   13131       IRTemp e1     = newTemp(Ity_I64);
   13132       IRTemp e0     = newTemp(Ity_I64);
   13133       IRTemp g1     = newTemp(Ity_I64);
   13134       IRTemp g0     = newTemp(Ity_I64);
   13135       IRTemp eV     = newTemp(Ity_V128);
   13136       IRTemp gV     = newTemp(Ity_V128);
   13137       IRTemp leftV  = newTemp(Ity_V128);
   13138       IRTemp rightV = newTemp(Ity_V128);
   13139       Bool   isAdd  = insn[1] == 0x7C;
   13140       HChar* str    = isAdd ? "add" : "sub";
   13141 
   13142       modrm = insn[2];
   13143       if (epartIsReg(modrm)) {
   13144          assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13145          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13146                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13147          delta += 2+1;
   13148       } else {
   13149          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13150          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   13151          DIP("h%spd %s,%s\n", str, dis_buf,
   13152                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   13153          delta += 2+alen;
   13154       }
   13155 
   13156       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13157 
   13158       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   13159       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   13160       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   13161       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   13162 
   13163       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   13164       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   13165 
   13166       putXMMReg( gregOfRexRM(pfx,modrm),
   13167                  binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   13168                        mkexpr(leftV), mkexpr(rightV) ) );
   13169       goto decode_success;
   13170    }
   13171 
   13172    /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   13173    if (haveF2no66noF3(pfx) && sz == 4
   13174        && insn[0] == 0x0F && insn[1] == 0xF0) {
   13175       modrm = insn[2];
   13176       if (epartIsReg(modrm)) {
   13177          goto decode_failure;
   13178       } else {
   13179          addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
   13180          putXMMReg( gregOfRexRM(pfx,modrm),
   13181                     loadLE(Ity_V128, mkexpr(addr)) );
   13182          DIP("lddqu %s,%s\n", dis_buf,
   13183                               nameXMMReg(gregOfRexRM(pfx,modrm)));
   13184          delta += 2+alen;
   13185       }
   13186       goto decode_success;
   13187    }
   13188 
   13189    /* ---------------------------------------------------- */
   13190    /* --- end of the SSE3 decoder.                     --- */
   13191    /* ---------------------------------------------------- */
   13192 
   13193    /* ---------------------------------------------------- */
   13194    /* --- start of the SSSE3 decoder.                  --- */
   13195    /* ---------------------------------------------------- */
   13196 
   13197    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   13198       Unsigned Bytes (MMX) */
   13199    if (haveNo66noF2noF3(pfx)
   13200        && sz == 4
   13201        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   13202       IRTemp sV        = newTemp(Ity_I64);
   13203       IRTemp dV        = newTemp(Ity_I64);
   13204       IRTemp sVoddsSX  = newTemp(Ity_I64);
   13205       IRTemp sVevensSX = newTemp(Ity_I64);
   13206       IRTemp dVoddsZX  = newTemp(Ity_I64);
   13207       IRTemp dVevensZX = newTemp(Ity_I64);
   13208 
   13209       modrm = insn[3];
   13210       do_MMX_preamble();
   13211       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13212 
   13213       if (epartIsReg(modrm)) {
   13214          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13215          delta += 3+1;
   13216          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   13217                                   nameMMXReg(gregLO3ofRM(modrm)));
   13218       } else {
   13219          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13220          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13221          delta += 3+alen;
   13222          DIP("pmaddubsw %s,%s\n", dis_buf,
   13223                                   nameMMXReg(gregLO3ofRM(modrm)));
   13224       }
   13225 
   13226       /* compute dV unsigned x sV signed */
   13227       assign( sVoddsSX,
   13228               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   13229       assign( sVevensSX,
   13230               binop(Iop_SarN16x4,
   13231                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   13232                     mkU8(8)) );
   13233       assign( dVoddsZX,
   13234               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   13235       assign( dVevensZX,
   13236               binop(Iop_ShrN16x4,
   13237                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   13238                     mkU8(8)) );
   13239 
   13240       putMMXReg(
   13241          gregLO3ofRM(modrm),
   13242          binop(Iop_QAdd16Sx4,
   13243                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   13244                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   13245          )
   13246       );
   13247       goto decode_success;
   13248    }
   13249 
   13250    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   13251       Unsigned Bytes (XMM) */
   13252    if (have66noF2noF3(pfx)
   13253        && (sz == 2 || /*redundant REX.W*/ sz == 8)
   13254        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   13255       IRTemp sV        = newTemp(Ity_V128);
   13256       IRTemp dV        = newTemp(Ity_V128);
   13257       IRTemp sVoddsSX  = newTemp(Ity_V128);
   13258       IRTemp sVevensSX = newTemp(Ity_V128);
   13259       IRTemp dVoddsZX  = newTemp(Ity_V128);
   13260       IRTemp dVevensZX = newTemp(Ity_V128);
   13261 
   13262       modrm = insn[3];
   13263       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13264 
   13265       if (epartIsReg(modrm)) {
   13266          assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   13267          delta += 3+1;
   13268          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13269                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13270       } else {
   13271          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13272          gen_SEGV_if_not_16_aligned( addr );
   13273          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13274          delta += 3+alen;
   13275          DIP("pmaddubsw %s,%s\n", dis_buf,
   13276                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13277       }
   13278 
   13279       /* compute dV unsigned x sV signed */
   13280       assign( sVoddsSX,
   13281               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   13282       assign( sVevensSX,
   13283               binop(Iop_SarN16x8,
   13284                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   13285                     mkU8(8)) );
   13286       assign( dVoddsZX,
   13287               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   13288       assign( dVevensZX,
   13289               binop(Iop_ShrN16x8,
   13290                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   13291                     mkU8(8)) );
   13292 
   13293       putXMMReg(
   13294          gregOfRexRM(pfx,modrm),
   13295          binop(Iop_QAdd16Sx8,
   13296                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   13297                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   13298          )
   13299       );
   13300       goto decode_success;
   13301    }
   13302 
   13303    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   13304    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   13305       mmx) and G to G (mmx). */
   13306    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   13307       mmx) and G to G (mmx). */
   13308    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   13309       to G (mmx). */
   13310    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   13311       to G (mmx). */
   13312    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   13313       to G (mmx). */
   13314    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   13315       to G (mmx). */
   13316 
   13317    if (haveNo66noF2noF3(pfx)
   13318        && sz == 4
   13319        && insn[0] == 0x0F && insn[1] == 0x38
   13320        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   13321            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   13322       HChar* str    = "???";
   13323       IROp   opV64  = Iop_INVALID;
   13324       IROp   opCatO = Iop_CatOddLanes16x4;
   13325       IROp   opCatE = Iop_CatEvenLanes16x4;
   13326       IRTemp sV     = newTemp(Ity_I64);
   13327       IRTemp dV     = newTemp(Ity_I64);
   13328 
   13329       modrm = insn[3];
   13330 
   13331       switch (insn[2]) {
   13332          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   13333          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   13334          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   13335          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   13336          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   13337          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   13338          default: vassert(0);
   13339       }
   13340       if (insn[2] == 0x02 || insn[2] == 0x06) {
   13341          opCatO = Iop_InterleaveHI32x2;
   13342          opCatE = Iop_InterleaveLO32x2;
   13343       }
   13344 
   13345       do_MMX_preamble();
   13346       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13347 
   13348       if (epartIsReg(modrm)) {
   13349          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13350          delta += 3+1;
   13351          DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   13352                                   nameMMXReg(gregLO3ofRM(modrm)));
   13353       } else {
   13354          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13355          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13356          delta += 3+alen;
   13357          DIP("ph%s %s,%s\n", str, dis_buf,
   13358                                   nameMMXReg(gregLO3ofRM(modrm)));
   13359       }
   13360 
   13361       putMMXReg(
   13362          gregLO3ofRM(modrm),
   13363          binop(opV64,
   13364                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   13365                binop(opCatO,mkexpr(sV),mkexpr(dV))
   13366          )
   13367       );
   13368       goto decode_success;
   13369    }
   13370 
   13371    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   13372       xmm) and G to G (xmm). */
   13373    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   13374       xmm) and G to G (xmm). */
   13375    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   13376       G to G (xmm). */
   13377    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   13378       G to G (xmm). */
   13379    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   13380       G to G (xmm). */
   13381    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   13382       G to G (xmm). */
   13383 
   13384    if (have66noF2noF3(pfx)
   13385        && (sz == 2 || /*redundant REX.W*/ sz == 8)
   13386        && insn[0] == 0x0F && insn[1] == 0x38
   13387        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   13388            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   13389       HChar* str    = "???";
   13390       IROp   opV64  = Iop_INVALID;
   13391       IROp   opCatO = Iop_CatOddLanes16x4;
   13392       IROp   opCatE = Iop_CatEvenLanes16x4;
   13393       IRTemp sV     = newTemp(Ity_V128);
   13394       IRTemp dV     = newTemp(Ity_V128);
   13395       IRTemp sHi    = newTemp(Ity_I64);
   13396       IRTemp sLo    = newTemp(Ity_I64);
   13397       IRTemp dHi    = newTemp(Ity_I64);
   13398       IRTemp dLo    = newTemp(Ity_I64);
   13399 
   13400       modrm = insn[3];
   13401 
   13402       switch (insn[2]) {
   13403          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   13404          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   13405          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   13406          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   13407          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   13408          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   13409          default: vassert(0);
   13410       }
   13411       if (insn[2] == 0x02 || insn[2] == 0x06) {
   13412          opCatO = Iop_InterleaveHI32x2;
   13413          opCatE = Iop_InterleaveLO32x2;
   13414       }
   13415 
   13416       assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13417 
   13418       if (epartIsReg(modrm)) {
   13419          assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
   13420          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   13421                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13422          delta += 3+1;
   13423       } else {
   13424          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   13425          gen_SEGV_if_not_16_aligned( addr );
   13426          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13427          DIP("ph%s %s,%s\n", str, dis_buf,
   13428                              nameXMMReg(gregOfRexRM(pfx,modrm)));
   13429          delta += 3+alen;
   13430       }
   13431 
   13432       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   13433       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   13434       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   13435       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   13436 
   13437       /* This isn't a particularly efficient way to compute the
   13438          result, but at least it avoids a proliferation of IROps,
   13439          hence avoids complication all the backends. */
   13440       putXMMReg(
   13441          gregOfRexRM(pfx,modrm),
   13442          binop(Iop_64HLtoV128,
   13443                binop(opV64,
   13444                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   13445                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   13446                ),
   13447                binop(opV64,
   13448                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   13449                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   13450                )
   13451          )
   13452       );
   13453       goto decode_success;
   13454    }
   13455 
   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
      (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV = newTemp(Ity_I64);   /* source operand (E part) */
      IRTemp dV = newTemp(Ity_I64);   /* destination operand (G part) */

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;   /* 3 opcode bytes + amode bytes */
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Per-lane multiply/round/scale is done by the shared helper. */
      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
      );
      goto decode_success;
   }
   13487 
   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
      Scale (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV  = newTemp(Ity_V128);   /* source (E part) */
      IRTemp dV  = newTemp(Ity_V128);   /* destination (G part) */
      IRTemp sHi = newTemp(Ity_I64);    /* upper/lower 64-bit halves, */
      IRTemp sLo = newTemp(Ity_I64);    /* processed separately so the */
      IRTemp dHi = newTemp(Ity_I64);    /* 64-bit MMX-style helper can */
      IRTemp dLo = newTemp(Ity_I64);    /* be reused */

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* memory operand of a non-VEX SSE insn must be 16-aligned */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* Run the 64-bit helper on each half and reassemble. */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
         )
      );
      goto decode_success;
   }
   13531 
   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_I64);   /* source (E part) */
      IRTemp dV      = newTemp(Ity_I64);   /* destination (G part) */
      HChar* str     = "???";              /* lane-size suffix for DIP */
      Int    laneszB = 0;                  /* lane size in bytes */

      /* Third opcode byte selects the lane width. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Lane-wise negate/zero/keep is done by the shared helper. */
      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
      );
      goto decode_success;
   }
   13574 
   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_V128);   /* source (E part) */
      IRTemp dV      = newTemp(Ity_V128);   /* destination (G part) */
      IRTemp sHi     = newTemp(Ity_I64);    /* 64-bit halves, so the */
      IRTemp sLo     = newTemp(Ity_I64);    /* 64-bit helper can be */
      IRTemp dHi     = newTemp(Ity_I64);    /* reused on each half */
      IRTemp dLo     = newTemp(Ity_I64);
      HChar* str     = "???";               /* lane-size suffix for DIP */
      Int    laneszB = 0;                   /* lane size in bytes */

      /* Third opcode byte selects the lane width. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* memory operand of a non-VEX SSE insn must be 16-aligned */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
         )
      );
      goto decode_success;
   }
   13629 
   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_I64);   /* source (E part); PABS is unary */
      HChar* str     = "???";              /* lane-size suffix for DIP */
      Int    laneszB = 0;                  /* lane size in bytes */

      /* Third opcode byte selects the lane width. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                    nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Lane-wise absolute value is done by the shared helper. */
      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PABS_helper( mkexpr(sV), laneszB )
      );
      goto decode_success;
   }
   13670 
   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_V128);   /* source (E part); PABS is unary */
      IRTemp sHi     = newTemp(Ity_I64);    /* halves, so the 64-bit */
      IRTemp sLo     = newTemp(Ity_I64);    /* helper can be reused */
      HChar* str     = "???";               /* lane-size suffix for DIP */
      Int    laneszB = 0;                   /* lane size in bytes */

      /* Third opcode byte selects the lane width. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* memory operand of a non-VEX SSE insn must be 16-aligned */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PABS_helper( mkexpr(sHi), laneszB ),
               dis_PABS_helper( mkexpr(sLo), laneszB )
         )
      );
      goto decode_success;
   }
   13719 
   13720    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   13721    if (haveNo66noF2noF3(pfx) && sz == 4
   13722        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   13723       IRTemp sV  = newTemp(Ity_I64);
   13724       IRTemp dV  = newTemp(Ity_I64);
   13725       IRTemp res = newTemp(Ity_I64);
   13726 
   13727       modrm = insn[3];
   13728       do_MMX_preamble();
   13729       assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13730 
   13731       if (epartIsReg(modrm)) {
   13732          assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13733          d64 = (Long)insn[3+1];
   13734          delta += 3+1+1;
   13735          DIP("palignr $%d,%s,%s\n",  (Int)d64,
   13736                                      nameMMXReg(eregLO3ofRM(modrm)),
   13737                                      nameMMXReg(gregLO3ofRM(modrm)));
   13738       } else {
   13739          addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
   13740          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13741          d64 = (Long)insn[3+alen];
   13742          delta += 3+alen+1;
   13743          DIP("palignr $%d%s,%s\n", (Int)d64,
   13744                                    dis_buf,
   13745                                    nameMMXReg(gregLO3ofRM(modrm)));
   13746       }
   13747 
   13748       if (d64 == 0) {
   13749          assign( res, mkexpr(sV) );
   13750       }
   13751       else if (d64 >= 1 && d64 <= 7) {
   13752          assign(res,
   13753                 binop(Iop_Or64,
   13754                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   13755                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   13756                      )));
   13757       }
   13758       else if (d64 == 8) {
   13759         assign( res, mkexpr(dV) );
   13760       }
   13761       else if (d64 >= 9 && d64 <= 15) {
   13762          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   13763       }
   13764       else if (d64 >= 16 && d64 <= 255) {
   13765          assign( res, mkU64(0) );
   13766       }
   13767       else
   13768          vassert(0);
   13769 
   13770       putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   13771       goto decode_success;
   13772    }
   13773 
   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
      IRTemp sV  = newTemp(Ity_V128);   /* source (E part) */
      IRTemp dV  = newTemp(Ity_V128);   /* destination (G part) */
      IRTemp sHi = newTemp(Ity_I64);    /* 64-bit halves of src/dst */
      IRTemp sLo = newTemp(Ity_I64);
      IRTemp dHi = newTemp(Ity_I64);
      IRTemp dLo = newTemp(Ity_I64);
      IRTemp rHi = newTemp(Ity_I64);    /* halves of the result */
      IRTemp rLo = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         d64 = (Long)insn[3+1];   /* imm8 follows the modrm byte */
         delta += 3+1+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         /* memory operand of a non-VEX SSE insn must be 16-aligned */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         d64 = (Long)insn[3+alen];   /* imm8 follows the amode */
         delta += 3+alen+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* Result is the low 128 bits of (dV ## sV) >> (8 * imm8), built
         from the four 64-bit pieces.  dis_PALIGNR_XMM_helper(hi,lo,n)
         presumably yields the low 64 bits of (hi ## lo) >> 8n for
         1 <= n <= 7 -- TODO confirm against the helper's definition.
         The byte shift amount is split into one case per 8-byte
         boundary. */
      if (d64 == 0) {
         assign( rHi, mkexpr(sHi) );
         assign( rLo, mkexpr(sLo) );
      }
      else if (d64 >= 1 && d64 <= 7) {
         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) );
         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) );
      }
      else if (d64 == 8) {
         assign( rHi, mkexpr(dLo) );
         assign( rLo, mkexpr(sHi) );
      }
      else if (d64 >= 9 && d64 <= 15) {
         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) );
         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) );
      }
      else if (d64 == 16) {
         assign( rHi, mkexpr(dHi) );
         assign( rLo, mkexpr(dLo) );
      }
      else if (d64 >= 17 && d64 <= 23) {
         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) );
         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) );
      }
      else if (d64 == 24) {
         assign( rHi, mkU64(0) );
         assign( rLo, mkexpr(dHi) );
      }
      else if (d64 >= 25 && d64 <= 31) {
         assign( rHi, mkU64(0) );
         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) );
      }
      else if (d64 >= 32 && d64 <= 255) {
         /* shift count >= total operand width: result is all zeroes */
         assign( rHi, mkU64(0) );
         assign( rLo, mkU64(0) );
      }
      else
         vassert(0);

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
   13858 
   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV      = newTemp(Ity_I64);   /* shuffle control (E part) */
      IRTemp dV      = newTemp(Ity_I64);   /* data to permute (G part) */

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                               nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* For each result byte i: if sV.byte[i] bit 7 is set the result
         byte is zero, else it is dV.byte[ sV.byte[i] & 7 ]. */
      putMMXReg(
         gregLO3ofRM(modrm),
         binop(
            Iop_And64,
            /* permute the lanes */
            binop(
               Iop_Perm8x8,
               mkexpr(dV),
               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
            ),
            /* mask off lanes which have (index & 0x80) == 0x80 */
            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
         )
      );
      goto decode_success;
   }
   13899 
   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV         = newTemp(Ity_V128);   /* shuffle control (E part) */
      IRTemp dV         = newTemp(Ity_V128);   /* data to permute (G part) */
      IRTemp sHi        = newTemp(Ity_I64);    /* 64-bit halves; the work is
                                                  done with 8x8 ops since
                                                  there is no Perm8x16 here */
      IRTemp sLo        = newTemp(Ity_I64);
      IRTemp dHi        = newTemp(Ity_I64);
      IRTemp dLo        = newTemp(Ity_I64);
      IRTemp rHi        = newTemp(Ity_I64);    /* result halves */
      IRTemp rLo        = newTemp(Ity_I64);
      IRTemp sevens     = newTemp(Ity_I64);    /* 0x07 in every byte lane */
      IRTemp mask0x80hi = newTemp(Ity_I64);    /* 0x00 where index bit 7 set */
      IRTemp mask0x80lo = newTemp(Ity_I64);
      IRTemp maskBit3hi = newTemp(Ity_I64);    /* 0xFF where index bit 3 set,
                                                  i.e. index selects dHi */
      IRTemp maskBit3lo = newTemp(Ity_I64);
      IRTemp sAnd7hi    = newTemp(Ity_I64);    /* index mod 8, per byte */
      IRTemp sAnd7lo    = newTemp(Ity_I64);
      IRTemp permdHi    = newTemp(Ity_I64);    /* permuted, pre-0x80-mask */
      IRTemp permdLo    = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* memory operand of a non-VEX SSE insn must be 16-aligned */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      assign( sevens, mkU64(0x0707070707070707ULL) );

      /*
      mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
      */
      assign(
         mask0x80hi,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

      /* Shift index bit 3 into bit 7, then replicate it across the
         byte via an arithmetic shift. */
      assign(
         maskBit3hi,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
               mkU8(7)));

      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

      /* Permute both dV halves by index-mod-8, then select per byte
         between them based on index bit 3. */
      assign(
         permdHi,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
                  mkexpr(maskBit3hi)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));

      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

      /* And the same for the lower half of the result.  What fun. */

      assign(
         mask0x80lo,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

      assign(
         maskBit3lo,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
               mkU8(7)));

      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

      assign(
         permdLo,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
                  mkexpr(maskBit3lo)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));

      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
   14012 
   14013    /* ---------------------------------------------------- */
   14014    /* --- end of the SSSE3 decoder.                    --- */
   14015    /* ---------------------------------------------------- */
   14016 
   14017    /* ---------------------------------------------------- */
   14018    /* --- start of the SSE4 decoder                    --- */
   14019    /* ---------------------------------------------------- */
   14020 
   /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
      Blend Packed Double Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0D ) {

      Int imm8;                /* selector byte following modrm/amode */
      UShort imm8_mask_16;     /* 16-bit nibble-per-byte mask for mkV128 */

      IRTemp dst_vec = newTemp(Ity_V128);
      IRTemp src_vec = newTemp(Ity_V128);
      IRTemp imm8_mask = newTemp(Ity_V128);

      modrm = insn[3];
      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];   /* imm8 follows the modrm byte */
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;        /* 3 opcode bytes + modrm + imm8 */
         DIP( "blendpd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];   /* == insn[3+alen] */
         delta += 3+alen+1;
         DIP( "blendpd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* imm8 bits 1:0 select, per 64-bit lane, src (1) or dst (0). */
      switch( imm8 & 3 ) {
         case 0:  imm8_mask_16 = 0x0000; break;
         case 1:  imm8_mask_16 = 0x00FF; break;
         case 2:  imm8_mask_16 = 0xFF00; break;
         case 3:  imm8_mask_16 = 0xFFFF; break;
         default: vassert(0);            break;
      }
      assign( imm8_mask, mkV128( imm8_mask_16 ) );

      /* result = (src & mask) | (dst & ~mask) */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
                        binop( Iop_AndV128, mkexpr(dst_vec),
                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );

      goto decode_success;
   }
   14072 
   14073 
   14074    /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   14075       Blend Packed Single Precision Floating-Point Values (XMM) */
   14076    if ( have66noF2noF3( pfx )
   14077         && sz == 2
   14078         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0C ) {
   14079 
   14080       Int imm8;
   14081       IRTemp dst_vec = newTemp(Ity_V128);
   14082       IRTemp src_vec = newTemp(Ity_V128);
   14083 
   14084       modrm = insn[3];
   14085 
   14086       assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14087 
   14088       if ( epartIsReg( modrm ) ) {
   14089          imm8 = (Int)insn[3+1];
   14090          assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14091          delta += 3+1+1;
   14092          DIP( "blendps $%d, %s,%s\n", imm8,
   14093               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14094               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14095       } else {
   14096          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14097                           1/* imm8 is 1 byte after the amode */ );
   14098          gen_SEGV_if_not_16_aligned( addr );
   14099          assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   14100          imm8 = (Int)insn[3+alen];
   14101          delta += 3+alen+1;
   14102          DIP( "blendpd $%d, %s,%s\n",
   14103               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14104       }
   14105 
   14106       UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00, 0x0F0F,
   14107                                 0x0FF0, 0x0FFF, 0xF000, 0xF00F, 0xF0F0, 0xF0FF,
   14108                                 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };
   14109       IRTemp imm8_mask = newTemp(Ity_V128);
   14110       assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   14111 
   14112       putXMMReg( gregOfRexRM(pfx, modrm),
   14113                  binop( Iop_OrV128,
   14114                         binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
   14115                         binop( Iop_AndV128, mkexpr(dst_vec),
   14116                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   14117 
   14118       goto decode_success;
   14119    }
   14120 
   14121 
   14122    /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   14123       Blend Packed Words (XMM) */
   14124    if ( have66noF2noF3( pfx )
   14125         && sz == 2
   14126         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0E ) {
   14127 
   14128       Int imm8;
   14129       IRTemp dst_vec = newTemp(Ity_V128);
   14130       IRTemp src_vec = newTemp(Ity_V128);
   14131 
   14132       modrm = insn[3];
   14133 
   14134       assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14135 
   14136       if ( epartIsReg( modrm ) ) {
   14137          imm8 = (Int)insn[3+1];
   14138          assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14139          delta += 3+1+1;
   14140          DIP( "pblendw $%d, %s,%s\n", imm8,
   14141               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14142               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14143       } else {
   14144          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14145                           1/* imm8 is 1 byte after the amode */ );
   14146          gen_SEGV_if_not_16_aligned( addr );
   14147          assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   14148          imm8 = (Int)insn[3+alen];
   14149          delta += 3+alen+1;
   14150          DIP( "pblendw $%d, %s,%s\n",
   14151               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14152       }
   14153 
   14154       /* Make w be a 16-bit version of imm8, formed by duplicating each
   14155          bit in imm8. */
   14156       Int i;
   14157       UShort imm16 = 0;
   14158       for (i = 0; i < 8; i++) {
   14159          if (imm8 & (1 << i))
   14160              imm16 |= (3 << (2*i));
   14161       }
   14162       IRTemp imm16_mask = newTemp(Ity_V128);
   14163       assign( imm16_mask, mkV128( imm16 ));
   14164 
   14165       putXMMReg( gregOfRexRM(pfx, modrm),
   14166                  binop( Iop_OrV128,
   14167                         binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm16_mask) ),
   14168                         binop( Iop_AndV128, mkexpr(dst_vec),
   14169                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   14170 
   14171       goto decode_success;
   14172    }
   14173 
   14174 
   14175    /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   14176     * Carry-less multiplication of selected XMM quadwords into XMM
   14177     * registers (a.k.a multiplication of polynomials over GF(2))
   14178     */
   14179    if ( have66noF2noF3( pfx )
   14180         && sz == 2
   14181         && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x44 ) {
   14182 
   14183       Int imm8;
   14184       IRTemp svec = newTemp(Ity_V128);
   14185       IRTemp dvec = newTemp(Ity_V128);
   14186 
   14187       modrm = insn[3];
   14188 
   14189       assign( dvec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   14190 
   14191       if ( epartIsReg( modrm ) ) {
   14192          imm8 = (Int)insn[4];
   14193          assign( svec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   14194          delta += 3+1+1;
   14195          DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   14196               nameXMMReg( eregOfRexRM(pfx, modrm) ),
   14197               nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14198       } else {
   14199          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
   14200                           1/* imm8 is 1 byte after the amode */ );
   14201          gen_SEGV_if_not_16_aligned( addr );
   14202          assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   14203          imm8 = (Int)insn[2+alen+1];
   14204          delta += 3+alen+1;
   14205          DIP( "pclmulqdq $%d, %s,%s\n",
   14206               imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   14207       }
   14208 
   14209       t0 = newTemp(Ity_I64);
   14210       t1 = newTemp(Ity_I64);
   14211       assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64, mkexpr(dvec)));
   14212       assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64, mkexpr(svec)));
   14213 
   14214       t2 = newTemp(Ity_I64);
   14215       t3 = newTemp(Ity_I64);
   14216 
   14217       IRExpr** args;
   14218 
   14219       args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   14220       assign(t2,
   14221               mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   14222                                        &amd64g_calculate_pclmul, args));
   14223       args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   14224       assign(t3,
   14225               mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   14226                                        &amd64g_calculate_pclmul, args));
   14227 
   14228       IRTemp res     = newTemp(Ity_V128);
   14229       assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   14230       putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14231 
   14232       goto decode_success;
   14233    }
   14234 
   /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
      Dot Product of Packed Double Precision Floating-Point Values (XMM).
      imm8 bits 5:4 select which products contribute to the sum; imm8
      bits 1:0 select which destination lanes receive the sum. */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x41 ) {

      Int imm8;
      IRTemp src_vec = newTemp(Ity_V128);  /* E operand (xmm2/m128) */
      IRTemp dst_vec = newTemp(Ity_V128);  /* G operand (xmm1) */
      IRTemp and_vec = newTemp(Ity_V128);  /* masked per-lane products */
      IRTemp sum_vec = newTemp(Ity_V128);  /* low lane holds hi+lo product */

      modrm = insn[3];

      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;   /* 0F 3A 41 + modrm + imm8 */
         DIP( "dppd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dppd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Byte-granularity V128 masks for each 2-bit lane-select value;
         each set bit of the UShort expands to an 0xFF byte in mkV128. */
      UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };

      /* Multiply lanewise, then zero out lanes excluded by imm8[5:4]. */
      assign( and_vec, binop( Iop_AndV128,
                              binop( Iop_Mul64Fx2,
                                     mkexpr(dst_vec), mkexpr(src_vec) ),
                              mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

      /* Add64F0x2 adds only the low lanes; the interleaves place the
         high product (HI) and low product (LO) into the low lane, so
         sum_vec's low lane is hi+lo. */
      assign( sum_vec, binop( Iop_Add64F0x2,
                              binop( Iop_InterleaveHI64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ),
                              binop( Iop_InterleaveLO64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ) ) );

      /* Broadcast the sum to both lanes, then keep only the lanes
         requested by imm8[1:0]. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_AndV128,
                        binop( Iop_InterleaveLO64x2,
                               mkexpr(sum_vec), mkexpr(sum_vec) ),
                        mkV128( imm8_perms[ (imm8 & 3) ] ) ) );

      goto decode_success;
   }
   14290 
   14291 
   /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
      Dot Product of Packed Single Precision Floating-Point Values (XMM).
      imm8 bits 7:4 select which products contribute to the sum; imm8
      bits 3:0 select which destination lanes receive the sum. */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F
        && insn[1] == 0x3A
        && insn[2] == 0x40 ) {

      Int imm8;
      IRTemp xmm1_vec     = newTemp(Ity_V128);  /* G operand (xmm1) */
      IRTemp xmm2_vec     = newTemp(Ity_V128);  /* E operand (xmm2/m128) */
      IRTemp tmp_prod_vec = newTemp(Ity_V128);  /* masked products */
      IRTemp prod_vec     = newTemp(Ity_V128);  /* products, lanes shuffled */
      IRTemp sum_vec      = newTemp(Ity_V128);  /* pairwise partial sums */
      IRTemp v3, v2, v1, v0;
      v3 = v2 = v1 = v0   = IRTemp_INVALID;

      modrm = insn[3];

      assign( xmm1_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( xmm2_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;   /* 0F 3A 40 + modrm + imm8 */
         DIP( "dpps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( xmm2_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dpps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Byte-granularity V128 masks for each 4-bit lane-select value;
         each set bit of the UShort expands to an 0xFF byte in mkV128. */
      UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                                0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                                0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };

      /* Multiply lanewise, then zero out lanes excluded by imm8[7:4]. */
      assign( tmp_prod_vec,
              binop( Iop_AndV128,
                     binop( Iop_Mul32Fx4, mkexpr(xmm1_vec), mkexpr(xmm2_vec) ),
                     mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
      /* Reorder lanes to (v3,v1,v2,v0) so that the interleave/add trick
         below pairs v3+v2 and v1+v0 in the first reduction step. */
      breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
      assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );

      /* First reduction: each lane of sum_vec holds either v3+v2 or
         v1+v0 after adding the HI/LO interleaves of prod_vec. */
      assign( sum_vec, binop( Iop_Add32Fx4,
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

      /* Second reduction broadcasts v3+v2+v1+v0 to all four lanes,
         then the result mask keeps only lanes chosen by imm8[3:0]. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_AndV128,
                        binop( Iop_Add32Fx4,
                               binop( Iop_InterleaveHI32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ),
                               binop( Iop_InterleaveLO32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                        mkV128( imm8_perms[ (imm8 & 15) ] ) ) );

      goto decode_success;
   }
   14359 
   14360 
   /* 66 0F 3A 21 /r ib = INSERTPS xmm1, xmm2/m32, imm8
      Insert Packed Single Precision Floating-Point Value (XMM).
      imm8 bits 7:6 select the source lane (register form only; the
      memory form always loads a single dword), bits 5:4 select the
      destination lane, and bits 3:0 are a zero-mask for the result. */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x21 ) {

      Int imm8;
      Int imm8_count_s;   /* imm8[7:6]: source lane select */
      Int imm8_count_d;   /* imm8[5:4]: destination lane select */
      Int imm8_zmask;     /* imm8[3:0]: per-lane zeroing mask */
      IRTemp dstVec   = newTemp(Ity_V128);
      IRTemp srcDWord = newTemp(Ity_I32);  /* the 32-bit value to insert */

      modrm = insn[3];

      assign( dstVec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         IRTemp src_vec = newTemp(Ity_V128);
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );

         IRTemp src_lane_0 = IRTemp_INVALID;
         IRTemp src_lane_1 = IRTemp_INVALID;
         IRTemp src_lane_2 = IRTemp_INVALID;
         IRTemp src_lane_3 = IRTemp_INVALID;
         breakup128to32s( src_vec,
                          &src_lane_3, &src_lane_2, &src_lane_1, &src_lane_0 );

         imm8 = (Int)insn[4];
         imm8_count_s = ((imm8 >> 6) & 3);
         switch( imm8_count_s ) {
           case 0:  assign( srcDWord, mkexpr(src_lane_0) ); break;
           case 1:  assign( srcDWord, mkexpr(src_lane_1) ); break;
           case 2:  assign( srcDWord, mkexpr(src_lane_2) ); break;
           case 3:  assign( srcDWord, mkexpr(src_lane_3) ); break;
           default: vassert(0);                             break;
         }

         delta += 3+1+1;   /* 0F 3A 21 + modrm + imm8 */
         DIP( "insertps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* const imm8 is 1 byte after the amode */ );
         assign( srcDWord, loadLE( Ity_I32, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         imm8_count_s = 0;   /* source lane select is ignored for mem form */
         delta += 3+alen+1;
         DIP( "insertps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp dst_lane_0 = IRTemp_INVALID;
      IRTemp dst_lane_1 = IRTemp_INVALID;
      IRTemp dst_lane_2 = IRTemp_INVALID;
      IRTemp dst_lane_3 = IRTemp_INVALID;
      breakup128to32s( dstVec,
                       &dst_lane_3, &dst_lane_2, &dst_lane_1, &dst_lane_0 );

      /* Overwrite the selected destination lane with srcDWord. */
      imm8_count_d = ((imm8 >> 4) & 3);
      switch( imm8_count_d ) {
         case 0:  dst_lane_0 = srcDWord; break;
         case 1:  dst_lane_1 = srcDWord; break;
         case 2:  dst_lane_2 = srcDWord; break;
         case 3:  dst_lane_3 = srcDWord; break;
         default: vassert(0);            break;
      }

      /* Zero out any lanes requested by the zmask (imm8[3:0]). */
      imm8_zmask = (imm8 & 15);
      IRTemp zero_32 = newTemp(Ity_I32);
      assign( zero_32, mkU32(0) );

      IRExpr* ire_vec_128 = mk128from32s(
                               ((imm8_zmask & 8) == 8) ? zero_32 : dst_lane_3,
                               ((imm8_zmask & 4) == 4) ? zero_32 : dst_lane_2,
                               ((imm8_zmask & 2) == 2) ? zero_32 : dst_lane_1,
                               ((imm8_zmask & 1) == 1) ? zero_32 : dst_lane_0 );

      putXMMReg( gregOfRexRM(pfx, modrm), ire_vec_128 );

      goto decode_success;
   }
   14444 
   14445 
   14446   /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   14447      Extract Byte from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
   14448   if ( have66noF2noF3( pfx )
   14449        && sz == 2
   14450        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x14 ) {
   14451 
   14452      Int imm8;
   14453      IRTemp xmm_vec  = newTemp(Ity_V128);
   14454      IRTemp sel_lane = newTemp(Ity_I32);
   14455      IRTemp shr_lane = newTemp(Ity_I32);
   14456 
   14457      modrm = insn[3];
   14458      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   14459      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   14460 
   14461      if ( epartIsReg( modrm ) ) {
   14462         imm8 = (Int)insn[3+1];
   14463      } else {
   14464         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
   14465         imm8 = (Int)insn[3+alen];
   14466      }
   14467      switch( (imm8 >> 2) & 3 ) {
   14468         case 0:  assign( sel_lane, mkexpr(t0) ); break;
   14469         case 1:  assign( sel_lane, mkexpr(t1) ); break;
   14470         case 2:  assign( sel_lane, mkexpr(t2) ); break;
   14471         case 3:  assign( sel_lane, mkexpr(t3) ); break;
   14472         default: vassert(0);
   14473      }
   14474      assign( shr_lane,
   14475              binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
   14476 
   14477      if ( epartIsReg( modrm ) ) {
   14478         putIReg64( eregOfRexRM(pfx,modrm),
   14479                    unop( Iop_32Uto64,
   14480                          binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
   14481 
   14482         delta += 3+1+1;
   14483         DIP( "pextrb $%d, %s,%s\n", imm8,
   14484              nameXMMReg( gregOfRexRM(pfx, modrm) ),
   14485              nameIReg64( eregOfRexRM(pfx, modrm) ) );
   14486      } else {
   14487         storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
   14488         delta += 3+alen+1;
   14489         DIP( "$%d, pextrb %s,%s\n",
   14490              imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   14491      }
   14492 
   14493      goto decode_success;
   14494   }
   14495 
   14496 
   /* 66 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
      Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRQ, but
      here the REX.W bit is _not_ present */
   if ( have66noF2noF3( pfx )
        && sz == 2  /* REX.W is _not_ present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_10;   /* imm8 bits 1:0 -- the dword lane select */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);  /* the extracted dword */

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);   /* imm8 follows the modrm byte */
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);   /* imm8 follows the amode */
      }

      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;   /* 0F 3A 16 + modrm + imm8 */
         DIP( "pextrd $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "pextrd $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14543 
   14544 
   /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
      Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRD, but
      here the REX.W bit is present */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_0;   /* imm8 bit 0 -- the qword lane select */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_qword = newTemp(Ity_I64);  /* the extracted qword */

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);   /* imm8 follows the modrm byte */
      } else {
         addr   = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);   /* imm8 follows the amode */
      }
      switch ( imm8_0 ) {
         case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) ); break;
         case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
         delta += 3+1+1;   /* 0F 3A 16 + modrm + imm8 */
         DIP( "pextrq $%d, %s,%s\n", imm8_0,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_qword) );
         delta += 3+alen+1;
         DIP( "pextrq $%d, %s,%s\n",
              imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14587 
   14588 
   /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
      Extract Word from xmm, store in mem or zero-extend + store in
      gen.reg. (XMM).  imm8 bits 2:0 select the word lane: bits 2:1
      choose one of the four 32-bit lanes, bit 0 chooses the low or
      high half of it. */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x15 ) {

      Int imm8_20;   /* imm8 bits 2:0 -- the word lane select */
      IRTemp xmm_vec = newTemp(Ity_V128);
      IRTemp src_word = newTemp(Ity_I16);  /* the extracted word */

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_20 = (Int)(insn[3+1] & 7);   /* imm8 follows the modrm byte */
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_20 = (Int)(insn[3+alen] & 7);   /* imm8 follows the amode */
      }

      switch ( imm8_20 ) {
         case 0:  assign( src_word, unop(Iop_32to16,   mkexpr(t0)) ); break;
         case 1:  assign( src_word, unop(Iop_32HIto16, mkexpr(t0)) ); break;
         case 2:  assign( src_word, unop(Iop_32to16,   mkexpr(t1)) ); break;
         case 3:  assign( src_word, unop(Iop_32HIto16, mkexpr(t1)) ); break;
         case 4:  assign( src_word, unop(Iop_32to16,   mkexpr(t2)) ); break;
         case 5:  assign( src_word, unop(Iop_32HIto16, mkexpr(t2)) ); break;
         case 6:  assign( src_word, unop(Iop_32to16,   mkexpr(t3)) ); break;
         case 7:  assign( src_word, unop(Iop_32HIto16, mkexpr(t3)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         /* Register destination: zero-extend the word to 64 bits. */
         putIReg64( eregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(src_word)) );
         delta += 3+1+1;   /* 0F 3A 15 + modrm + imm8 */
         DIP( "pextrw $%d, %s,%s\n", imm8_20,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory destination: store just the 16-bit word. */
         storeLE( mkexpr(addr), mkexpr(src_word) );
         delta += 3+alen+1;
         DIP( "pextrw $%d, %s,%s\n",
              imm8_20, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14637 
   14638 
   /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
      Extract Quadword int from gen.reg/mem64 and insert into xmm1.
      imm8 bit 0 selects which qword of xmm1 is replaced. */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_0;   /* imm8 bit 0 -- the qword lane select */
      IRTemp src_elems = newTemp(Ity_I64);   /* the 64-bit value to insert */
      IRTemp src_vec   = newTemp(Ity_V128);  /* value positioned in its lane */

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);   /* imm8 follows the modrm byte */
         assign( src_elems, getIReg64( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;   /* 0F 3A 22 + modrm + imm8 */
         DIP( "pinsrq $%d, %s,%s\n", imm8_0,
              nameIReg64( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);   /* imm8 follows the amode */
         assign( src_elems, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrq $%d, %s,%s\n",
              imm8_0, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* mask keeps the bytes of xmm1 that are NOT being replaced; each
         set bit of the UShort expands to an 0xFF byte in mkV128. */
      UShort mask = 0;
      if ( imm8_0 == 0 ) {
         mask = 0xFF00;   /* replacing the low qword: preserve the high one */
         assign( src_vec,  binop( Iop_64HLtoV128, mkU64(0), mkexpr(src_elems) ) );
      } else {
         mask = 0x00FF;   /* replacing the high qword: preserve the low one */
         assign( src_vec, binop( Iop_64HLtoV128, mkexpr(src_elems), mkU64(0) ) );
      }

      /* Merge: new value OR (old register AND preserve-mask). */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14684 
   14685 
   /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
      Extract Doubleword int from gen.reg/mem32 and insert into xmm1.
      imm8 bits 1:0 select which dword of xmm1 is replaced. */
   if ( have66noF2noF3( pfx )
        && sz == 2 /* REX.W is NOT present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_10;   /* imm8 bits 1:0 -- the dword lane select */
      IRTemp src_elems = newTemp(Ity_I32);   /* the 32-bit value to insert */
      IRTemp src_vec   = newTemp(Ity_V128);  /* value positioned in its lane */
      IRTemp z32       = newTemp(Ity_I32);   /* zero filler for other lanes */

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);   /* imm8 follows the modrm byte */
         assign( src_elems, getIReg32( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;   /* 0F 3A 22 + modrm + imm8 */
         DIP( "pinsrd $%d, %s,%s\n", imm8_10,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);   /* imm8 follows the amode */
         assign( src_elems, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrd $%d, %s,%s\n",
              imm8_10, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(z32, mkU32(0));

      /* mask keeps the bytes of xmm1 that are NOT being replaced; each
         set bit of the UShort expands to an 0xFF byte in mkV128. */
      UShort mask = 0;
      switch (imm8_10) {
         case 3:  mask = 0x0FFF;
                  assign(src_vec, mk128from32s(src_elems, z32, z32, z32));
                  break;
         case 2:  mask = 0xF0FF;
                  assign(src_vec, mk128from32s(z32, src_elems, z32, z32));
                  break;
         case 1:  mask = 0xFF0F;
                  assign(src_vec, mk128from32s(z32, z32, src_elems, z32));
                  break;
         case 0:  mask = 0xFFF0;
                  assign(src_vec, mk128from32s(z32, z32, z32, src_elems));
                  break;
         default: vassert(0);
      }

      /* Merge: new value OR (old register AND preserve-mask). */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14742 
   /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
      Extract byte from r32/m8 and insert into xmm1.
      imm8 bits 3:0 select which byte of xmm1 is replaced. */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) {

      Int    imm8;
      IRTemp new8 = newTemp(Ity_I64);   /* the byte to insert, zero-extended */

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)(insn[3+1] & 0xF);   /* imm8 follows the modrm byte */
         /* Only the low byte of the source register is used. */
         assign( new8, binop(Iop_And64,
                             unop(Iop_32Uto64,
                                  getIReg32(eregOfRexRM(pfx,modrm))),
                             mkU64(0xFF)));
         delta += 3+1+1;   /* 0F 3A 20 + modrm + imm8 */
         DIP( "pinsrb $%d,%s,%s\n", imm8,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8 = (Int)(insn[3+alen] & 0xF);   /* imm8 follows the amode */
         assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) )));
         delta += 3+alen+1;
         DIP( "pinsrb $%d,%s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      // Create a V128 value which has the selected byte in the
      // specified lane, and zeroes everywhere else.
      IRTemp tmp128 = newTemp(Ity_V128);
      IRTemp halfshift = newTemp(Ity_I64);
      /* Shift the byte into position within a 64-bit half (imm8 mod 8),
         then place that half in the low or high qword of the vector. */
      assign(halfshift, binop(Iop_Shl64,
                              mkexpr(new8), mkU8(8 * (imm8 & 7))));
      vassert(imm8 >= 0 && imm8 <= 15);   /* guaranteed by & 0xF above */
      if (imm8 < 8) {
         assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
      } else {
         assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
      }

      /* mask clears exactly the target byte of xmm1; each set bit of
         the UShort expands to an 0xFF byte in mkV128. */
      UShort mask = ~(1 << imm8);

      /* Merge: new byte OR (old register AND preserve-mask). */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        mkexpr(tmp128),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
   14797 
   14798 
   /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
      float from xmm reg and store in gen.reg or mem.  This is
      identical to PEXTRD, except that REX.W appears to be ignored.
   */
   if ( have66noF2noF3( pfx )
        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {

      Int imm8_10;   /* imm8 bits 1:0 -- the dword lane select */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);  /* extracted 32-bit lane */

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);   /* imm8 follows the modrm byte */
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);   /* imm8 follows the amode */
      }

      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;   /* 0F 3A 17 + modrm + imm8 */
         DIP( "extractps $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "extractps $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
   14845 
   14846 
   /* 66 0F 38 37 = PCMPGTQ
      64x2 comparison (signed, presumably; the Intel docs don't say :-)
      The helper decodes the E operand, emits the IR, prints the DIP
      line and returns the advanced delta.
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
      /* FIXME: this needs an alignment check */
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
                                 "pcmpgtq", Iop_CmpGT64Sx2, False );
      goto decode_success;
   }
   14857 
   /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
      Maximum of Packed Signed Double Word Integers (XMM)
      66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
      Minimum of Packed Signed Double Word Integers (XMM)
      Both share a decoder; the third opcode byte distinguishes them. */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38
        && (insn[2] == 0x3D || insn[2] == 0x39)) {
      /* FIXME: this needs an alignment check */
      Bool isMAX = insn[2] == 0x3D;
      delta = dis_SSEint_E_to_G(
                 vbi, pfx, delta+3,
                 isMAX ? "pmaxsd" : "pminsd",
                 isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
                 False
              );
      goto decode_success;
   }
   14875 
   /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
      Maximum of Packed Unsigned Doubleword Integers (XMM)
      66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
      Minimum of Packed Unsigned Doubleword Integers (XMM)
      Both share a decoder; the third opcode byte distinguishes them. */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38
        && (insn[2] == 0x3F || insn[2] == 0x3B)) {
      /* FIXME: this needs an alignment check */
      Bool isMAX = insn[2] == 0x3F;
      delta = dis_SSEint_E_to_G(
                 vbi, pfx, delta+3,
                 isMAX ? "pmaxud" : "pminud",
                 isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
                 False
              );
      goto decode_success;
   }
   14893 
   /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
      Maximum of Packed Unsigned Word Integers (XMM)
      66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
      Minimum of Packed Unsigned Word Integers (XMM)
      Both share a decoder; the third opcode byte distinguishes them.
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38
        && (insn[2] == 0x3E || insn[2] == 0x3A)) {
      /* FIXME: this needs an alignment check */
      Bool isMAX = insn[2] == 0x3E;
      delta = dis_SSEint_E_to_G(
                 vbi, pfx, delta+3,
                 isMAX ? "pmaxuw" : "pminuw",
                 isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
                 False
              );
      goto decode_success;
   }
   14912 
   14913    /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
   14914       8Sx16 (signed) max
   14915       66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
   14916       8Sx16 (signed) min
   14917    */
   14918    if ( have66noF2noF3( pfx ) && sz == 2
   14919         && insn[0] == 0x0F && insn[1] == 0x38
   14920         && (insn[2] == 0x3C || insn[2] == 0x38)) {
   14921       /* FIXME: this needs an alignment check */
   14922       Bool isMAX = insn[2] == 0x3C;
   14923       delta = dis_SSEint_E_to_G(
   14924                  vbi, pfx, delta+3,
   14925                  isMAX ? "pmaxsb" : "pminsb",
   14926                  isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   14927                  False
   14928               );
   14929       goto decode_success;
   14930    }
   14931 
   /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
      Packed Move with Sign Extend from Byte to Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x20 ) {

      modrm = insn[3];

      /* Source: 8 bytes in the low half of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 20 + modrm */
         DIP( "pmovsxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check is
            required for sub-128-bit SSE memory operands. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Widen 8x8 -> 8x16 by interleaving with zero, then sign-extend
         each lane's source byte with a shift-left-8 / arithmetic-
         shift-right-8 pair (the shl discards the zero filler, the sar
         replicates the byte's sign bit). */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN16x8,
                        binop( Iop_ShlN16x8,
                               binop( Iop_InterleaveLO8x16,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(8) ),
                        mkU8(8) ) );

      goto decode_success;
   }
   14968 
   14969 
   /* 66 0f 38 21 /r = PMOVSXBD xmm1, xmm2/m32
      Packed Move with Sign Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x21 ) {

      modrm = insn[3];

      /* Source: 4 bytes in the low 32 bits of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 21 + modrm */
         DIP( "pmovsxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) )  );
      } else {
         /* Memory form reads only 32 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two zero-interleaves widen each byte into its own 32-bit lane;
         the shl-24/sar-24 pair then sign-extends the byte occupying
         the lane's low 8 bits. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec),
                                      binop( Iop_InterleaveLO8x16,
                                             mkexpr(zeroVec),
                                             mkexpr(srcVec) ) ),
                               mkU8(24) ), mkU8(24) ) );

      goto decode_success;
   }
   15010 
   15011 
   /* 66 0f 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
      Packed Move with Sign Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x22 ) {

      modrm = insn[3];

      /* Only two source bytes are consumed, held as one I16. */
      IRTemp srcBytes = newTemp(Ity_I16);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane16( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;                      /* 0F 38 22 + modrm */
         DIP( "pmovsxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 16 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Split the 16 bits into its two bytes, sign-extend each to 64
         bits, and rebuild: high byte -> upper qword, low byte -> lower
         qword. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_8Sto64,
                              unop( Iop_16HIto8,
                                    mkexpr(srcBytes) ) ),
                        unop( Iop_8Sto64,
                              unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   15046 
   15047 
   /* 66 0f 38 23 /r = PMOVSXWD xmm1, xmm2/m64
      Packed Move with Sign Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x23 ) {

      modrm = insn[3];

      /* Source: 4 words in the low half of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 23 + modrm */
         DIP( "pmovsxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Widen 4x16 -> 4x32 via zero-interleave, then sign-extend each
         lane's low word with the shl-16/sar-16 pair. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO16x8,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(16) ),
                        mkU8(16) ) );

      goto decode_success;
   }
   15084 
   15085 
   /* 66 0f 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
      Packed Move with Sign Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x24 ) {

      modrm = insn[3];

      /* Only two source words are consumed, held as one I32. */
      IRTemp srcBytes = newTemp(Ity_I32);

      if ( epartIsReg( modrm ) ) {
         assign( srcBytes, getXMMRegLane32( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;                      /* 0F 38 24 + modrm */
         DIP( "pmovsxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Sign-extend each 16-bit half of the I32 to 64 bits and rebuild
         the vector: high word -> upper qword, low word -> lower qword. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_16Sto64,
                              unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
                        unop( Iop_16Sto64,
                              unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   15119 
   15120 
   /* 66 0f 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
      Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x25 ) {

      modrm = insn[3];

      /* Two source dwords, held as one I64. */
      IRTemp srcBytes = newTemp(Ity_I64);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane64( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;                      /* 0F 38 25 + modrm */
         DIP( "pmovsxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Sign-extend each 32-bit half of the I64 to 64 bits and rebuild:
         high dword -> upper qword, low dword -> lower qword. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_64HLtoV128,
                        unop( Iop_32Sto64,
                              unop( Iop_64HIto32, mkexpr(srcBytes) ) ),
                        unop( Iop_32Sto64,
                              unop( Iop_64to32, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
   15154 
   15155 
   /* 66 0f 38 30 /r = PMOVZXBW xmm1, xmm2/m64
      Packed Move with Zero Extend from Byte to Word (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x30 ) {

      modrm = insn[3];

      /* Source: 8 bytes in the low half of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 30 + modrm */
         DIP( "pmovzxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Zero-extension needs only a single interleave with zero:
         each source byte lands in a 16-bit lane whose other byte is 0. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO8x16,
                        IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );

      goto decode_success;
   }
   15187 
   15188 
   /* 66 0f 38 31 /r = PMOVZXBD xmm1, xmm2/m32
      Packed Move with Zero Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x31 ) {

      modrm = insn[3];

      /* Source: 4 bytes in the low 32 bits of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 31 + modrm */
         DIP( "pmovzxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two zero-interleaves spread each source byte into its own
         32-bit lane, with the remaining three bytes zero. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
   15225 
   15226 
   /* 66 0f 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
      Packed Move with Zero Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x32 ) {

      modrm = insn[3];

      /* Source: 2 bytes in the low 16 bits of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 32 + modrm */
         DIP( "pmovzxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 16 bits, widened in two steps
            (I16 -> I32 -> V128); no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128,
                       unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Three zero-interleaves spread the two source bytes into the
         two 64-bit lanes, with the other seven bytes of each zero. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec),
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );

      goto decode_success;
   }
   15266 
   15267 
   /* 66 0f 38 33 /r = PMOVZXWD xmm1, xmm2/m64
      Packed Move with Zero Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x33 ) {

      modrm = insn[3];

      /* Source: 4 words in the low half of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 33 + modrm */
         DIP( "pmovzxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* One interleave with zero widens 4x16 -> 4x32, zero-filled. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO16x8,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
   15300 
   15301 
   /* 66 0f 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
      Packed Move with Zero Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x34 ) {

      modrm = insn[3];

      /* Source: 2 words in the low 32 bits of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 34 + modrm */
         DIP( "pmovzxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp( Ity_V128 );
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two zero-interleaves spread each source word into its own
         64-bit lane, zero-filled. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO16x8,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO16x8,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
   15338 
   15339 
   /* 66 0f 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
      Packed Move with Zero Extend from DWord to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x35 ) {

      modrm = insn[3];

      /* Source: 2 dwords in the low half of a V128. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 35 + modrm */
         DIP( "pmovzxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* One interleave with zero widens 2x32 -> 2x64, zero-filled. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO32x4,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
   15372 
   15373 
   /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
      32x4 integer multiply from xmm2/m128 to xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {

      modrm = insn[3];

      IRTemp argL = newTemp(Ity_V128);   /* E (second) operand */
      IRTemp argR = newTemp(Ity_V128);   /* G (destination) operand */

      if ( epartIsReg(modrm) ) {
         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;                      /* 0F 38 40 + modrm */
         DIP( "pmulld %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, so it must be 16-aligned. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "pmulld %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

      /* Lane-wise 32x32 multiply, keeping the low 32 bits of each
         product, as PMULLD specifies. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

      goto decode_success;
   }
   15407 
   15408 
   /* F3 0F B8  = POPCNT{W,L,Q}
      Count the number of 1 bits in a register
    */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xB8) {
      /* Operand size is 2 (66 prefix), 4 (default) or 8 (REX.W). */
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;                      /* 0F B8 + modrm */
         DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      IRTemp result = gen_POPCOUNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(result));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A C P are cleared.  Z is set if SRC == 0.
      // CC_OP_COPY means DEP1 holds the flags verbatim; DEP1 is
      // built with only the Z bit (conditionally) set, so all other
      // flags come out cleared.
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
            binop(Iop_Shl64,
                  unop(Iop_1Uto64,
                       binop(Iop_CmpEQ64,
                             widenUto64(mkexpr(src)),
                             mkU64(0))),
                  mkU8(AMD64G_CC_SHIFT_Z))));

      goto decode_success;
   }
   15450 
   15451 
   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] == 0x0B || insn[2] == 0x0A)) {

      /* 0x0B is the double-precision (ROUNDSD) form. */
      Bool   isD = insn[2] == 0x0B;
      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
      Int    imm = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src,
                 isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
                     : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;                    /* 0F 3A 0A/0B + modrm + imm8 */
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC instead of the
         immediate mode"; imm bit 3 is accepted but not modelled. */
      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                        (imm & 4) ? get_sse_roundingmode()
                                  : mkU32(imm & 3),
                        mkexpr(src)) );

      /* Only the destination's lane 0 is written; upper lanes keep
         their previous contents. */
      if (isD)
         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
      else
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );

      goto decode_success;
   }
   15505 
   15506 
   /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x09) {

      IRTemp src0 = newTemp(Ity_F64);   /* lane 0 of the source */
      IRTemp src1 = newTemp(Ity_F64);   /* lane 1 of the source */
      IRTemp res0 = newTemp(Ity_F64);
      IRTemp res1 = newTemp(Ity_F64);
      IRTemp rm   = newTemp(Ity_I32);   /* rounding mode, IR encoding */
      Int    imm  = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;                  /* 0F 3A 09 + modrm + imm8 */
         DIP( "roundpd $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, so it must be 16-aligned;
            the two F64 lanes are loaded at offsets 0 and 8. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "roundpd $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC instead". */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );

      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );

      goto decode_success;
   }
   15560 
   15561 
   /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x08) {

      /* One temp per F32 lane of the source and the result. */
      IRTemp src0 = newTemp(Ity_F32);
      IRTemp src1 = newTemp(Ity_F32);
      IRTemp src2 = newTemp(Ity_F32);
      IRTemp src3 = newTemp(Ity_F32);
      IRTemp res0 = newTemp(Ity_F32);
      IRTemp res1 = newTemp(Ity_F32);
      IRTemp res2 = newTemp(Ity_F32);
      IRTemp res3 = newTemp(Ity_F32);
      IRTemp rm   = newTemp(Ity_I32);   /* rounding mode, IR encoding */
      Int    imm  = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
         assign( src2,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
         assign( src3,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;                  /* 0F 3A 08 + modrm + imm8 */
         DIP( "roundps $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, so it must be 16-aligned;
            the four F32 lanes are loaded at offsets 0,4,8,12. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
         assign( src2, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         assign( src3, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "roundps $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  imm bit 2 set means "use MXCSR.RC instead". */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
      assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
      assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );

      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );

      goto decode_success;
   }
   15631 
   15632 
   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
      which we can only decode if we're sure this is an AMD cpu that
      supports LZCNT, since otherwise it's BSR, which behaves
      differently. */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xBD
       && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
      /* Operand size is 16, 32 or 64 bits, chosen by 66/REX.W. */
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         /* Register source: 2 opcode bytes + 1 modrm byte consumed. */
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;
         DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         /* Memory source: decode the addressing mode, then load. */
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      IRTemp res = gen_LZCNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(res));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A P are cleared.  Z is set if RESULT == 0.
      // C is set if SRC is zero.
      /* Widen both values to 64 bits so one CmpEQ64 form works for
         all three operand sizes. */
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp res64 = newTemp(Ity_I64);
      assign(src64, widenUto64(mkexpr(src)));
      assign(res64, widenUto64(mkexpr(res)));

      /* Build the OSZACP bit vector directly: the Z bit from
         res64==0, the C bit from src64==0; all other bits zero. */
      IRTemp oszacp = newTemp(Ity_I64);
      assign(
         oszacp,
         binop(Iop_Or64,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z)),
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_C))
         )
      );

      /* Set the flags thunk to COPY so DEP1 is used verbatim. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

      goto decode_success;
   }
   15691 
   /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
      66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
      66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
      66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
      (selected special cases that actually occur in glibc,
       not by any means a complete implementation.)
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] >= 0x60 && insn[2] <= 0x63)) {

      /* Opcode bit 1 set => implicit-length (xISTRx) form; opcode
         bit 0 clear => mask result (xSTRM) rather than index. */
      UInt  isISTRx = insn[2] & 2;
      UInt  isxSTRM = (insn[2] & 1) ^ 1;
      UInt  regNoL = 0;
      UInt  regNoR = 0;
      UChar imm    = 0;

      /* This is a nasty kludge.  We need to pass 2 x V128 to the
         helper (which is clean).  Since we can't do that, use a dirty
         helper to compute the results directly from the XMM regs in
         the guest state.  That means for the memory case, we need to
         move the left operand into a pseudo-register (XMM16, let's
         call it). */
      modrm = insn[3];
      if (epartIsReg(modrm)) {
         regNoL = eregOfRexRM(pfx, modrm);
         regNoR = gregOfRexRM(pfx, modrm);
         imm = insn[3+1];
         delta += 3+1+1;
      } else {
         regNoL = 16; /* use XMM16 as an intermediary */
         regNoR = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* No alignment check; I guess that makes sense, given that
            these insns are for dealing with C style strings. */
         stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
         imm = insn[3+alen];
         delta += 3+alen+1;
      }

      /* Now we know the XMM reg numbers for the operands, and the
         immediate byte.  Is it one we can actually handle? Throw out
         any cases for which the helper function has not been
         verified. */
      switch (imm) {
         case 0x00:
         case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
         case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x4A:
            break;
         default:
            goto decode_failure;
      }

      /* Who ya gonna call?  Presumably not Ghostbusters. */
      void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
      HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

      /* Round up the arguments.  Note that this is a kludge -- the
         use of mkU64 rather than mkIRExpr_HWord implies the
         assumption that the host's word size is 64-bit. */
      UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
      UInt gstOffR = xmmGuestRegOffset(regNoR);

      /* Pack the sub-opcode byte and the immediate into one word;
         the helper decodes them again on its side. */
      IRExpr*  opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF));
      IRExpr*  gstOffLe     = mkU64(gstOffL);
      IRExpr*  gstOffRe     = mkU64(gstOffR);
      /* The explicit-length (xESTRx) forms read lengths from RDX/RAX;
         the implicit forms pass zero instead. */
      IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
      IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
      IRExpr** args
         = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

      IRTemp   resT = newTemp(Ity_I64);
      IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
      /* It's not really a dirty call, but we can't use the clean
         helper mechanism here for the very lame reason that we can't
         pass 2 x V128s by value to a helper, nor get one back.  Hence
         this roundabout scheme. */
      d->needsBBP = True;
      /* Declare the guest-state pieces the helper reads, so that
         instrumenting tools see the data flow. */
      d->nFxState = 2;
      d->fxState[0].fx     = Ifx_Read;
      d->fxState[0].offset = gstOffL;
      d->fxState[0].size   = sizeof(U128);
      d->fxState[1].fx     = Ifx_Read;
      d->fxState[1].offset = gstOffR;
      d->fxState[1].size   = sizeof(U128);
      if (isxSTRM) {
         /* Declare that the helper writes XMM0. */
         d->nFxState = 3;
         d->fxState[2].fx     = Ifx_Write;
         d->fxState[2].offset = xmmGuestRegOffset(0);
         d->fxState[2].size   = sizeof(U128);
      }

      stmt( IRStmt_Dirty(d) );

      /* Now resT[15:0] holds the new OSZACP values, so the condition
         codes must be updated. And for a xSTRI case, resT[31:16]
         holds the new ECX value, so stash that too. */
      if (!isxSTRM) {
         putIReg64(R_RCX, binop(Iop_And64,
                                binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                                mkU64(0xFFFF)));
      }

      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
      ));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      /* regNoL == 16 means the left operand came from memory. */
      if (regNoL == 16) {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, dis_buf, nameXMMReg(regNoR));
      } else {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
      }

      goto decode_success;
   }
   15817 
   15818 
   /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128
      Logical compare (set ZF and CF from AND/ANDN of the operands) */
   if (have66noF2noF3( pfx )
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) {
      modrm = insn[3];
      IRTemp vecE = newTemp(Ity_V128);
      IRTemp vecG = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "ptest %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory operand must be 16-aligned, as usual for SSE. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "ptest %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));

      /* Set Z=1 iff (vecE & vecG) == 0
         Set C=1 iff (vecE & not vecG) == 0
      */

      /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
      /* mkV128 takes a 16-bit mask, one bit per byte lane, so
         mkV128(0xFFFF) is the all-ones vector used for NOT-via-XOR. */
      IRTemp andV  = newTemp(Ity_V128);
      IRTemp andnV = newTemp(Ity_V128);
      assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
      assign(andnV, binop(Iop_AndV128,
                          mkexpr(vecE),
                          binop(Iop_XorV128, mkexpr(vecG),
                                             mkV128(0xFFFF))));

      /* The same, but reduced to 64-bit values, by or-ing the top
         and bottom 64-bits together.  It relies on this trick:

          InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

          InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
          InterleaveHI64x2([a,b],[a,b]) == [a,a]

          and so the OR of the above 2 exprs produces
          [a OR b, a OR b], from which we simply take the lower half.
      */
      IRTemp and64  = newTemp(Ity_I64);
      IRTemp andn64 = newTemp(Ity_I64);

      assign(
         and64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV))
              )
         )
      );

      assign(
         andn64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV))
              )
          )
       );

      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      /* Note the final Not64: we want all-ones when the AND result
         IS zero, i.e. the inverse of the standard trick's output. */
      IRTemp z64 = newTemp(Ity_I64);
      IRTemp c64 = newTemp(Ity_I64);
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                              mkexpr(and64)
                        ),
                        mkU8(63)))
      );

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                              mkexpr(andn64)
                        ),
                        mkU8(63)))
      );

      /* And finally, slice out the Z and C flags and set the flags
         thunk to COPY for them.  OSAP are set to zero. */
      IRTemp newOSZACP = newTemp(Ity_I64);
      assign(newOSZACP,
             binop(Iop_Or64,
                   binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                   binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))
             )
      );

      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      goto decode_success;
   }
   15936 
   /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
      66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
      66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
      Blend at various granularities, with XMM0 (implicit operand)
      providing the controlling mask.
   */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
      modrm = insn[3];

      /* Pick the mnemonic, the lane width in bytes, and the per-lane
         arithmetic-shift op used below to build the mask. */
      HChar* nm    = NULL;
      UInt   gran  = 0;
      IROp   opSAR = Iop_INVALID;
      switch (insn[2]) {
         case 0x15:
            nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
            break;
         case 0x14:
            nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
            break;
         case 0x10:
            nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
            break;
      }
      vassert(nm);

      IRTemp vecE = newTemp(Ity_V128);
      IRTemp vecG = newTemp(Ity_V128);
      IRTemp vec0 = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "%s %s,%s\n", nm,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory operand must be 16-aligned, as usual for SSE. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "%s %s,%s\n", nm,
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
      assign(vec0, getXMMReg(0));

      /* Now the tricky bit is to convert vec0 into a suitable mask,
         by copying the most significant bit of each lane into all
         positions in the lane. */
      IRTemp sh = newTemp(Ity_I8);
      assign(sh, mkU8(8 * gran - 1));

      IRTemp mask = newTemp(Ity_V128);
      assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));

      IRTemp notmask = newTemp(Ity_V128);
      assign(notmask, unop(Iop_NotV128, mkexpr(mask)));

      /* result = (E & mask) | (G & ~mask): lanes whose control MSB
         is set come from E, the rest keep their G value. */
      IRExpr* res = binop(Iop_OrV128,
                          binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
                          binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
      putXMMReg(gregOfRexRM(pfx, modrm), res);

      goto decode_success;
   }
   16005 
   16006    /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
   16007       F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
   16008       The decoding on this is a bit unusual.
   16009    */
   16010    if (haveF2noF3(pfx)
   16011        && insn[0] == 0x0F && insn[1] == 0x38
   16012        && (insn[2] == 0xF1
   16013            || (insn[2] == 0xF0 && !have66(pfx)))) {
   16014       modrm = insn[3];
   16015 
   16016       if (insn[2] == 0xF0)
   16017          sz = 1;
   16018       else
   16019          vassert(sz == 2 || sz == 4 || sz == 8);
   16020 
   16021       IRType tyE = szToITy(sz);
   16022       IRTemp valE = newTemp(tyE);
   16023 
   16024       if (epartIsReg(modrm)) {
   16025          assign(valE, getIRegE(sz, pfx, modrm));
   16026          delta += 3+1;
   16027          DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
   16028              nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
   16029       } else {
   16030          addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
   16031          assign(valE, loadLE(tyE, mkexpr(addr)));
   16032          delta += 3+alen;
   16033          DIP("crc32b %s,%s\n", dis_buf,
   16034              nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
   16035       }
   16036 
   16037       /* Somewhat funny getting/putting of the crc32 value, in order
   16038          to ensure that it turns into 64-bit gets and puts.  However,
   16039          mask off the upper 32 bits so as to not get memcheck false
   16040          +ves around the helper call. */
   16041       IRTemp valG0 = newTemp(Ity_I64);
   16042       assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
   16043                           mkU64(0xFFFFFFFF)));
   16044 
   16045       HChar* nm = NULL;
   16046       void* fn = NULL;
   16047       switch (sz) {
   16048          case 1: nm = "amd64g_calc_crc32b";
   16049                  fn = &amd64g_calc_crc32b; break;
   16050          case 2: nm = "amd64g_calc_crc32w";
   16051                  fn = &amd64g_calc_crc32w; break;
   16052          case 4: nm = "amd64g_calc_crc32l";
   16053                  fn = &amd64g_calc_crc32l; break;
   16054          case 8: nm = "amd64g_calc_crc32q";
   16055                  fn = &amd64g_calc_crc32q; break;
   16056       }
   16057       vassert(nm && fn);
   16058       IRTemp valG1 = newTemp(Ity_I64);
   16059       assign(valG1,
   16060              mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
   16061                            mkIRExprVec_2(mkexpr(valG0),
   16062                                          widenUto64(mkexpr(valE)))));
   16063 
   16064       putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
   16065       goto decode_success;
   16066    }
   16067 
   /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
      2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x2B ) {

      modrm = insn[3];

      IRTemp argL = newTemp(Ity_V128);
      IRTemp argR = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;
         DIP( "packusdw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory operand must be 16-aligned, as usual for SSE. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "packusdw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

      /* A single IR op does the whole signed->unsigned narrow. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_QNarrowBin32Sto16Ux8,
                        mkexpr(argL), mkexpr(argR)) );

      goto decode_success;
   }
   16102 
   /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes 0 x
      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
      half */
   /* NOTE(review): the original comment named this PMULUDQ, but the
      code multiplies signed (Iop_MullS32) and prints "pmuldq", which
      matches opcode 66 0F 38 28 = PMULDQ. */
   /* This is a really poor translation -- could be improved if
      performance critical.  It's a copy-paste of PMULUDQ, with the
      multiplies done signed instead of unsigned. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x28) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      t1 = newTemp(Ity_I64);
      t0 = newTemp(Ity_I64);
      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pmuldq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmuldq %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Split both vectors into 32-bit lanes; only lanes 0 and 2 of
         each are actually used by the multiply. */
      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      assign( t0, binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
      assign( t1, binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
      goto decode_success;
   }
   16142 
   /* 66 0F 38 29 = PCMPEQQ
      64x2 equality comparison
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x29) {
      /* FIXME: this needs an alignment check */
      /* Delegates all E/G decoding to the generic SSE int helper;
         False = operands are not swapped. */
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
                                 "pcmpeqq", Iop_CmpEQ64x2, False );
      goto decode_success;
   }
   16153 
   16154    /* ---------------------------------------------------- */
   16155    /* --- end of the SSE4 decoder                      --- */
   16156    /* ---------------------------------------------------- */
   16157 
   16158    /*after_sse_decoders:*/
   16159 
   16160    /* Get the primary opcode. */
   16161    opc = getUChar(delta); delta++;
   16162 
   16163    /* We get here if the current insn isn't SSE, or this CPU doesn't
   16164       support SSE. */
   16165 
   16166    switch (opc) {
   16167 
   16168    /* ------------------------ Control flow --------------- */
   16169 
   case 0xC2: /* RET imm16 */
      if (have66orF2orF3(pfx)) goto decode_failure;
      /* imm16 = number of extra bytes to pop off the stack after
         the return address; dis_ret handles both pops. */
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(vbi, d64);
      dres.whatNext = Dis_StopHere;
      DIP("ret %lld\n", d64);
      break;

   case 0xC3: /* RET */
      if (have66orF2(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      dis_ret(vbi, 0);
      dres.whatNext = Dis_StopHere;
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      break;

   case 0xE8: /* CALL J4 */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Target is RIP-relative: add the signed 32-bit displacement
         to the address of the next instruction. */
      d64 = getSDisp32(delta); delta += 4;
      d64 += (guest_RIP_bbstart+delta);
      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
      /* Push the return address. */
      t1 = newTemp(Ity_I64);
      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t1));
      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
      t2 = newTemp(Ity_I64);
      assign(t2, mkU64((Addr64)d64));
      /* Tell tools the stack redzone below RSP is now dead. */
      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* follow into the call target. */
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Call,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("call 0x%llx\n",d64);
      break;
   16209 
   16210 //.. //--    case 0xC8: /* ENTER */
   16211 //.. //--       d32 = getUDisp16(eip); eip += 2;
   16212 //.. //--       abyte = getUChar(delta); delta++;
   16213 //.. //--
   16214 //.. //--       vg_assert(sz == 4);
   16215 //.. //--       vg_assert(abyte == 0);
   16216 //.. //--
   16217 //.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
   16218 //.. //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   16219 //.. //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   16220 //.. //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   16221 //.. //--       uLiteral(cb, sz);
   16222 //.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   16223 //.. //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   16224 //.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   16225 //.. //--       if (d32) {
   16226 //.. //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   16227 //.. //--          uLiteral(cb, d32);
   16228 //.. //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   16229 //.. //--       }
   16230 //.. //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   16231 //.. //--       break;
   16232 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* Second immediate is the nesting depth; only depth 0 is
         supported, anything else bails out. */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));
      putIReg64(R_RBP, mkexpr(t2));
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      break;

   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      /* rsp = rbp; rbp = pop */
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      break;
   16281 
   16282 //.. //--    /* ---------------- Misc weird-ass insns --------------- */
   16283 //.. //--
   16284 //.. //--    case 0x27: /* DAA */
   16285 //.. //--    case 0x2F: /* DAS */
   16286 //.. //--       t1 = newTemp(cb);
   16287 //.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t1);
   16288 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
   16289 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16290 //.. //--       uWiden(cb, 1, False);
   16291 //.. //--       uInstr0(cb, CALLM_S, 0);
   16292 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16293 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16294 //.. //--                   opc == 0x27 ? VGOFF_(helper_DAA) : VGOFF_(helper_DAS) );
   16295 //.. //--       uFlagsRWU(cb, FlagsAC, FlagsSZACP, FlagO);
   16296 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16297 //.. //--       uInstr0(cb, CALLM_E, 0);
   16298 //.. //--       uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, R_AL);
   16299 //.. //--       DIP(opc == 0x27 ? "daa\n" : "das\n");
   16300 //.. //--       break;
   16301 //.. //--
   16302 //.. //--    case 0x37: /* AAA */
   16303 //.. //--    case 0x3F: /* AAS */
   16304 //.. //--       t1 = newTemp(cb);
   16305 //.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
   16306 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
   16307 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16308 //.. //--       uWiden(cb, 2, False);
   16309 //.. //--       uInstr0(cb, CALLM_S, 0);
   16310 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16311 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16312 //.. //--                   opc == 0x37 ? VGOFF_(helper_AAA) : VGOFF_(helper_AAS) );
   16313 //.. //--       uFlagsRWU(cb, FlagA, FlagsAC, FlagsEmpty);
   16314 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16315 //.. //--       uInstr0(cb, CALLM_E, 0);
   16316 //.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
   16317 //.. //--       DIP(opc == 0x37 ? "aaa\n" : "aas\n");
   16318 //.. //--       break;
   16319 //.. //--
   16320 //.. //--    case 0xD4: /* AAM */
   16321 //.. //--    case 0xD5: /* AAD */
   16322 //.. //--       d32 = getUChar(delta); delta++;
   16323 //.. //--       if (d32 != 10) VG_(core_panic)("disInstr: AAM/AAD but base not 10 !");
   16324 //.. //--       t1 = newTemp(cb);
   16325 //.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
   16326 //.. //--       /* Widen %AX to 32 bits, so it's all defined when we push it. */
   16327 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
   16328 //.. //--       uWiden(cb, 2, False);
   16329 //.. //--       uInstr0(cb, CALLM_S, 0);
   16330 //.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
   16331 //.. //--       uInstr1(cb, CALLM, 0, Lit16,
   16332 //.. //--                   opc == 0xD4 ? VGOFF_(helper_AAM) : VGOFF_(helper_AAD) );
   16333 //.. //--       uFlagsRWU(cb, FlagsEmpty, FlagsSZP, FlagsEmpty);
   16334 //.. //--       uInstr1(cb, POP, 4, TempReg, t1);
   16335 //.. //--       uInstr0(cb, CALLM_E, 0);
   16336 //.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
   16337 //.. //--       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   16338 //.. //--       break;
   16339 
   16340    /* ------------------------ CWD/CDQ -------------------- */
   16341 
   16342    case 0x98: /* CBW */
   16343       if (haveF2orF3(pfx)) goto decode_failure;
   16344       if (sz == 8) {
   16345          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   16346          DIP(/*"cdqe\n"*/"cltq");
   16347          break;
   16348       }
   16349       if (sz == 4) {
   16350          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   16351          DIP("cwtl\n");
   16352          break;
   16353       }
   16354       if (sz == 2) {
   16355          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   16356          DIP("cbw\n");
   16357          break;
   16358       }
   16359       goto decode_failure;
   16360 
   16361    case 0x99: /* CWD/CDQ/CQO */
   16362       if (haveF2orF3(pfx)) goto decode_failure;
   16363       vassert(sz == 2 || sz == 4 || sz == 8);
   16364       ty = szToITy(sz);
   16365       putIRegRDX( sz,
   16366                   binop(mkSizedOp(ty,Iop_Sar8),
   16367                         getIRegRAX(sz),
   16368                         mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
   16369       DIP(sz == 2 ? "cwd\n"
   16370                   : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
   16371                              : "cqo\n"));
   16372       break;
   16373 
   16374    /* ------------------------ FPU ops -------------------- */
   16375 
   16376    case 0x9E: /* SAHF */
   16377       codegen_SAHF();
   16378       DIP("sahf\n");
   16379       break;
   16380 
   16381    case 0x9F: /* LAHF */
   16382       codegen_LAHF();
   16383       DIP("lahf\n");
   16384       break;
   16385 
   16386    case 0x9B: /* FWAIT */
   16387       /* ignore? */
   16388       DIP("fwait\n");
   16389       break;
   16390 
   16391    case 0xD8:
   16392    case 0xD9:
   16393    case 0xDA:
   16394    case 0xDB:
   16395    case 0xDC:
   16396    case 0xDD:
   16397    case 0xDE:
   16398    case 0xDF: {
   16399       Bool redundantREXWok = False;
   16400 
   16401       if (haveF2orF3(pfx))
   16402          goto decode_failure;
   16403 
   16404       /* kludge to tolerate redundant rex.w prefixes (should do this
   16405          properly one day) */
   16406       /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
   16407       if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
   16408          redundantREXWok = True;
   16409 
   16410       if ( (sz == 4
   16411            || (sz == 8 && redundantREXWok))
   16412            && haveNo66noF2noF3(pfx)) {
   16413          Long delta0    = delta;
   16414          Bool decode_OK = False;
   16415          delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
   16416          if (!decode_OK) {
   16417             delta = delta0;
   16418             goto decode_failure;
   16419          }
   16420          break;
   16421       } else {
   16422          goto decode_failure;
   16423       }
   16424    }
   16425 
   16426    /* ------------------------ INT ------------------------ */
   16427 
   16428    case 0xCC: /* INT 3 */
   16429       jmp_lit(Ijk_SigTRAP, guest_RIP_bbstart + delta);
   16430       dres.whatNext = Dis_StopHere;
   16431       DIP("int $0x3\n");
   16432       break;
   16433 
   16434    case 0xCD: { /* INT imm8 */
   16435       IRJumpKind jk = Ijk_Boring;
   16436       if (have66orF2orF3(pfx)) goto decode_failure;
   16437       d64 = getUChar(delta); delta++;
   16438       switch (d64) {
   16439          case 32: jk = Ijk_Sys_int32; break;
   16440          default: goto decode_failure;
   16441       }
   16442       guest_RIP_next_mustcheck = True;
   16443       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   16444       jmp_lit(jk, guest_RIP_next_assumed);
   16445       /* It's important that all ArchRegs carry their up-to-date value
   16446          at this point.  So we declare an end-of-block here, which
   16447          forces any TempRegs caching ArchRegs to be flushed. */
   16448       dres.whatNext = Dis_StopHere;
   16449       DIP("int $0x%02x\n", (UInt)d64);
   16450       break;
   16451    }
   16452 
   16453    /* ------------------------ Jcond, byte offset --------- */
   16454 
   16455    case 0xEB: /* Jb (jump, byte offset) */
   16456       if (haveF2orF3(pfx)) goto decode_failure;
   16457       if (sz != 4)
   16458          goto decode_failure; /* JRS added 2004 July 11 */
   16459       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   16460       delta++;
   16461       if (resteerOkFn(callback_opaque,d64)) {
   16462          dres.whatNext   = Dis_ResteerU;
   16463          dres.continueAt = d64;
   16464       } else {
   16465          jmp_lit(Ijk_Boring,d64);
   16466          dres.whatNext = Dis_StopHere;
   16467       }
   16468       DIP("jmp-8 0x%llx\n", d64);
   16469       break;
   16470 
   16471    case 0xE9: /* Jv (jump, 16/32 offset) */
   16472       if (haveF2orF3(pfx)) goto decode_failure;
   16473       if (sz != 4)
   16474          goto decode_failure; /* JRS added 2004 July 11 */
   16475       d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
   16476       delta += sz;
   16477       if (resteerOkFn(callback_opaque,d64)) {
   16478          dres.whatNext   = Dis_ResteerU;
   16479          dres.continueAt = d64;
   16480       } else {
   16481          jmp_lit(Ijk_Boring,d64);
   16482          dres.whatNext = Dis_StopHere;
   16483       }
   16484       DIP("jmp 0x%llx\n", d64);
   16485       break;
   16486 
   16487    case 0x70:
   16488    case 0x71:
   16489    case 0x72: /* JBb/JNAEb (jump below) */
   16490    case 0x73: /* JNBb/JAEb (jump not below) */
   16491    case 0x74: /* JZb/JEb (jump zero) */
   16492    case 0x75: /* JNZb/JNEb (jump not zero) */
   16493    case 0x76: /* JBEb/JNAb (jump below or equal) */
   16494    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   16495    case 0x78: /* JSb (jump negative) */
   16496    case 0x79: /* JNSb (jump not negative) */
   16497    case 0x7A: /* JP (jump parity even) */
   16498    case 0x7B: /* JNP/JPO (jump parity odd) */
   16499    case 0x7C: /* JLb/JNGEb (jump less) */
   16500    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   16501    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   16502    case 0x7F: /* JGb/JNLEb (jump greater) */
   16503     { Long   jmpDelta;
   16504       HChar* comment  = "";
   16505       if (haveF2orF3(pfx)) goto decode_failure;
   16506       jmpDelta = getSDisp8(delta);
   16507       vassert(-128 <= jmpDelta && jmpDelta < 128);
   16508       d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
   16509       delta++;
   16510       if (resteerCisOk
   16511           && vex_control.guest_chase_cond
   16512           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   16513           && jmpDelta < 0
   16514           && resteerOkFn( callback_opaque, d64) ) {
   16515          /* Speculation: assume this backward branch is taken.  So we
   16516             need to emit a side-exit to the insn following this one,
   16517             on the negation of the condition, and continue at the
   16518             branch target address (d64).  If we wind up back at the
   16519             first instruction of the trace, just stop; it's better to
   16520             let the IR loop unroller handle that case. */
   16521          stmt( IRStmt_Exit(
   16522                   mk_amd64g_calculate_condition(
   16523                      (AMD64Condcode)(1 ^ (opc - 0x70))),
   16524                   Ijk_Boring,
   16525                   IRConst_U64(guest_RIP_bbstart+delta) ) );
   16526          dres.whatNext   = Dis_ResteerC;
   16527          dres.continueAt = d64;
   16528          comment = "(assumed taken)";
   16529       }
   16530       else
   16531       if (resteerCisOk
   16532           && vex_control.guest_chase_cond
   16533           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   16534           && jmpDelta >= 0
   16535           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   16536          /* Speculation: assume this forward branch is not taken.  So
   16537             we need to emit a side-exit to d64 (the dest) and continue
   16538             disassembling at the insn immediately following this
   16539             one. */
   16540          stmt( IRStmt_Exit(
   16541                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
   16542                   Ijk_Boring,
   16543                   IRConst_U64(d64) ) );
   16544          dres.whatNext   = Dis_ResteerC;
   16545          dres.continueAt = guest_RIP_bbstart+delta;
   16546          comment = "(assumed not taken)";
   16547       }
   16548       else {
   16549          /* Conservative default translation - end the block at this
   16550             point. */
   16551          jcc_01( (AMD64Condcode)(opc - 0x70),
   16552                  guest_RIP_bbstart+delta,
   16553                  d64 );
   16554          dres.whatNext = Dis_StopHere;
   16555       }
   16556       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
   16557       break;
   16558     }
   16559 
   16560    case 0xE3:
   16561       /* JRCXZ or JECXZ, depending address size override. */
   16562       if (have66orF2orF3(pfx)) goto decode_failure;
   16563       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   16564       delta++;
   16565       if (haveASO(pfx)) {
   16566          /* 32-bit */
   16567          stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
   16568                             unop(Iop_32Uto64, getIReg32(R_RCX)),
   16569                             mkU64(0)),
   16570                Ijk_Boring,
   16571                IRConst_U64(d64))
   16572              );
   16573          DIP("jecxz 0x%llx\n", d64);
   16574       } else {
   16575          /* 64-bit */
   16576          stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
   16577                                   getIReg64(R_RCX),
   16578                                   mkU64(0)),
   16579                Ijk_Boring,
   16580                IRConst_U64(d64))
   16581              );
   16582          DIP("jrcxz 0x%llx\n", d64);
   16583       }
   16584       break;
   16585 
   16586    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   16587    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   16588    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   16589     { /* The docs say this uses rCX as a count depending on the
   16590          address size override, not the operand one. */
   16591       IRExpr* zbit  = NULL;
   16592       IRExpr* count = NULL;
   16593       IRExpr* cond  = NULL;
   16594       HChar*  xtra  = NULL;
   16595 
   16596       if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
   16597       /* So at this point we've rejected any variants which appear to
   16598          be governed by the usual operand-size modifiers.  Hence only
   16599          the address size prefix can have an effect.  It changes the
   16600          size from 64 (default) to 32. */
   16601       d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
   16602       delta++;
   16603       if (haveASO(pfx)) {
   16604          /* 64to32 of 64-bit get is merely a get-put improvement
   16605             trick. */
   16606          putIReg32(R_RCX, binop(Iop_Sub32,
   16607                                 unop(Iop_64to32, getIReg64(R_RCX)),
   16608                                 mkU32(1)));
   16609       } else {
   16610          putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
   16611       }
   16612 
   16613       /* This is correct, both for 32- and 64-bit versions.  If we're
   16614          doing a 32-bit dec and the result is zero then the default
   16615          zero extension rule will cause the upper 32 bits to be zero
   16616          too.  Hence a 64-bit check against zero is OK. */
   16617       count = getIReg64(R_RCX);
   16618       cond = binop(Iop_CmpNE64, count, mkU64(0));
   16619       switch (opc) {
   16620          case 0xE2:
   16621             xtra = "";
   16622             break;
   16623          case 0xE1:
   16624             xtra = "e";
   16625             zbit = mk_amd64g_calculate_condition( AMD64CondZ );
   16626             cond = mkAnd1(cond, zbit);
   16627             break;
   16628          case 0xE0:
   16629             xtra = "ne";
   16630             zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
   16631             cond = mkAnd1(cond, zbit);
   16632             break;
   16633          default:
   16634 	    vassert(0);
   16635       }
   16636       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );
   16637 
   16638       DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
   16639       break;
   16640     }
   16641 
   16642    /* ------------------------ IMUL ----------------------- */
   16643 
   16644    case 0x69: /* IMUL Iv, Ev, Gv */
   16645       if (haveF2orF3(pfx)) goto decode_failure;
   16646       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
   16647       break;
   16648    case 0x6B: /* IMUL Ib, Ev, Gv */
   16649       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
   16650       break;
   16651 
   16652    /* ------------------------ MOV ------------------------ */
   16653 
   16654    case 0x88: /* MOV Gb,Eb */
   16655       if (haveF2orF3(pfx)) goto decode_failure;
   16656       delta = dis_mov_G_E(vbi, pfx, 1, delta);
   16657       break;
   16658 
   16659    case 0x89: /* MOV Gv,Ev */
   16660       if (haveF2orF3(pfx)) goto decode_failure;
   16661       delta = dis_mov_G_E(vbi, pfx, sz, delta);
   16662       break;
   16663 
   16664    case 0x8A: /* MOV Eb,Gb */
   16665       if (haveF2orF3(pfx)) goto decode_failure;
   16666       delta = dis_mov_E_G(vbi, pfx, 1, delta);
   16667       break;
   16668 
   16669    case 0x8B: /* MOV Ev,Gv */
   16670       if (haveF2orF3(pfx)) goto decode_failure;
   16671       delta = dis_mov_E_G(vbi, pfx, sz, delta);
   16672       break;
   16673 
   16674    case 0x8D: /* LEA M,Gv */
   16675       if (haveF2orF3(pfx)) goto decode_failure;
   16676       if (sz != 4 && sz != 8)
   16677          goto decode_failure;
   16678       modrm = getUChar(delta);
   16679       if (epartIsReg(modrm))
   16680          goto decode_failure;
   16681       /* NOTE!  this is the one place where a segment override prefix
   16682          has no effect on the address calculation.  Therefore we clear
   16683          any segment override bits in pfx. */
   16684       addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
   16685       delta += alen;
   16686       /* This is a hack.  But it isn't clear that doing the
   16687          calculation at 32 bits is really worth it.  Hence for leal,
   16688          do the full 64-bit calculation and then truncate it. */
   16689       putIRegG( sz, pfx, modrm,
   16690                          sz == 4
   16691                             ? unop(Iop_64to32, mkexpr(addr))
   16692                             : mkexpr(addr)
   16693               );
   16694       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   16695                             nameIRegG(sz,pfx,modrm));
   16696       break;
   16697 
   16698 //..    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   16699 //..       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   16700 //..       break;
   16701 //..
   16702 //..    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   16703 //..       delta = dis_mov_Ew_Sw(sorb, delta);
   16704 //..       break;
   16705 
   16706    case 0xA0: /* MOV Ob,AL */
   16707       if (have66orF2orF3(pfx)) goto decode_failure;
   16708       sz = 1;
   16709       /* Fall through ... */
   16710    case 0xA1: /* MOV Ov,eAX */
   16711       if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
   16712          goto decode_failure;
   16713       d64 = getDisp64(delta);
   16714       delta += 8;
   16715       ty = szToITy(sz);
   16716       addr = newTemp(Ity_I64);
   16717       assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
   16718       putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
   16719       DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
   16720                                   segRegTxt(pfx), d64,
   16721                                   nameIRegRAX(sz));
   16722       break;
   16723 
   16724    case 0xA2: /* MOV AL,Ob */
   16725       if (have66orF2orF3(pfx)) goto decode_failure;
   16726       sz = 1;
   16727       /* Fall through ... */
   16728    case 0xA3: /* MOV eAX,Ov */
   16729       if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
   16730          goto decode_failure;
   16731       d64 = getDisp64(delta);
   16732       delta += 8;
   16733       ty = szToITy(sz);
   16734       addr = newTemp(Ity_I64);
   16735       assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
   16736       storeLE( mkexpr(addr), getIRegRAX(sz) );
   16737       DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
   16738                                   segRegTxt(pfx), d64);
   16739       break;
   16740 
   16741    /* XXXX be careful here with moves to AH/BH/CH/DH */
   16742    case 0xB0: /* MOV imm,AL */
   16743    case 0xB1: /* MOV imm,CL */
   16744    case 0xB2: /* MOV imm,DL */
   16745    case 0xB3: /* MOV imm,BL */
   16746    case 0xB4: /* MOV imm,AH */
   16747    case 0xB5: /* MOV imm,CH */
   16748    case 0xB6: /* MOV imm,DH */
   16749    case 0xB7: /* MOV imm,BH */
   16750       if (haveF2orF3(pfx)) goto decode_failure;
   16751       d64 = getUChar(delta);
   16752       delta += 1;
   16753       putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
   16754       DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
   16755       break;
   16756 
   16757    case 0xB8: /* MOV imm,eAX */
   16758    case 0xB9: /* MOV imm,eCX */
   16759    case 0xBA: /* MOV imm,eDX */
   16760    case 0xBB: /* MOV imm,eBX */
   16761    case 0xBC: /* MOV imm,eSP */
   16762    case 0xBD: /* MOV imm,eBP */
   16763    case 0xBE: /* MOV imm,eSI */
   16764    case 0xBF: /* MOV imm,eDI */
   16765       /* This is the one-and-only place where 64-bit literals are
   16766          allowed in the instruction stream. */
   16767       if (haveF2orF3(pfx)) goto decode_failure;
   16768       if (sz == 8) {
   16769          d64 = getDisp64(delta);
   16770          delta += 8;
   16771          putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
   16772          DIP("movabsq $%lld,%s\n", (Long)d64,
   16773                                    nameIRegRexB(8,pfx,opc-0xB8));
   16774       } else {
   16775          d64 = getSDisp(imin(4,sz),delta);
   16776          delta += imin(4,sz);
   16777          putIRegRexB(sz, pfx, opc-0xB8,
   16778                          mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   16779          DIP("mov%c $%lld,%s\n", nameISize(sz),
   16780                                  (Long)d64,
   16781                                  nameIRegRexB(sz,pfx,opc-0xB8));
   16782       }
   16783       break;
   16784 
   16785    case 0xC6: /* MOV Ib,Eb */
   16786       sz = 1;
   16787       goto do_Mov_I_E;
   16788    case 0xC7: /* MOV Iv,Ev */
   16789       goto do_Mov_I_E;
   16790 
   16791    do_Mov_I_E:
   16792       if (haveF2orF3(pfx)) goto decode_failure;
   16793       modrm = getUChar(delta);
   16794       if (epartIsReg(modrm)) {
   16795          delta++; /* mod/rm byte */
   16796          d64 = getSDisp(imin(4,sz),delta);
   16797          delta += imin(4,sz);
   16798          putIRegE(sz, pfx, modrm,
   16799                       mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   16800          DIP("mov%c $%lld, %s\n", nameISize(sz),
   16801                                   (Long)d64,
   16802                                   nameIRegE(sz,pfx,modrm));
   16803       } else {
   16804          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   16805                            /*xtra*/imin(4,sz) );
   16806          delta += alen;
   16807          d64 = getSDisp(imin(4,sz),delta);
   16808          delta += imin(4,sz);
   16809          storeLE(mkexpr(addr),
   16810                  mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   16811          DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
   16812       }
   16813       break;
   16814 
   16815    /* ------------------------ MOVx ------------------------ */
   16816 
   16817    case 0x63: /* MOVSX */
   16818       if (haveF2orF3(pfx)) goto decode_failure;
   16819       if (haveREX(pfx) && 1==getRexW(pfx)) {
   16820          vassert(sz == 8);
   16821          /* movsx r/m32 to r64 */
   16822          modrm = getUChar(delta);
   16823          if (epartIsReg(modrm)) {
   16824             delta++;
   16825             putIRegG(8, pfx, modrm,
   16826                              unop(Iop_32Sto64,
   16827                                   getIRegE(4, pfx, modrm)));
   16828             DIP("movslq %s,%s\n",
   16829                 nameIRegE(4, pfx, modrm),
   16830                 nameIRegG(8, pfx, modrm));
   16831             break;
   16832          } else {
   16833             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   16834             delta += alen;
   16835             putIRegG(8, pfx, modrm,
   16836                              unop(Iop_32Sto64,
   16837                                   loadLE(Ity_I32, mkexpr(addr))));
   16838             DIP("movslq %s,%s\n", dis_buf,
   16839                 nameIRegG(8, pfx, modrm));
   16840             break;
   16841          }
   16842       } else {
   16843          goto decode_failure;
   16844       }
   16845 
   16846    /* ------------------------ opl imm, A ----------------- */
   16847 
   16848    case 0x04: /* ADD Ib, AL */
   16849       if (haveF2orF3(pfx)) goto decode_failure;
   16850       delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
   16851       break;
   16852    case 0x05: /* ADD Iv, eAX */
   16853       if (haveF2orF3(pfx)) goto decode_failure;
   16854       delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
   16855       break;
   16856 
   16857    case 0x0C: /* OR Ib, AL */
   16858       if (haveF2orF3(pfx)) goto decode_failure;
   16859       delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
   16860       break;
   16861    case 0x0D: /* OR Iv, eAX */
   16862       if (haveF2orF3(pfx)) goto decode_failure;
   16863       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   16864       break;
   16865 
   16866    case 0x14: /* ADC Ib, AL */
   16867       if (haveF2orF3(pfx)) goto decode_failure;
   16868       delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
   16869       break;
   16870    case 0x15: /* ADC Iv, eAX */
   16871       if (haveF2orF3(pfx)) goto decode_failure;
   16872       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   16873       break;
   16874 
   16875    case 0x1C: /* SBB Ib, AL */
   16876       if (haveF2orF3(pfx)) goto decode_failure;
   16877       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   16878       break;
   16879    case 0x1D: /* SBB Iv, eAX */
   16880       if (haveF2orF3(pfx)) goto decode_failure;
   16881       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   16882       break;
   16883 
   16884    case 0x24: /* AND Ib, AL */
   16885       if (haveF2orF3(pfx)) goto decode_failure;
   16886       delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
   16887       break;
   16888    case 0x25: /* AND Iv, eAX */
   16889       if (haveF2orF3(pfx)) goto decode_failure;
   16890       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   16891       break;
   16892 
   16893    case 0x2C: /* SUB Ib, AL */
   16894       if (haveF2orF3(pfx)) goto decode_failure;
   16895       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   16896       break;
   16897    case 0x2D: /* SUB Iv, eAX */
   16898       if (haveF2orF3(pfx)) goto decode_failure;
   16899       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   16900       break;
   16901 
   16902    case 0x34: /* XOR Ib, AL */
   16903       if (haveF2orF3(pfx)) goto decode_failure;
   16904       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   16905       break;
   16906    case 0x35: /* XOR Iv, eAX */
   16907       if (haveF2orF3(pfx)) goto decode_failure;
   16908       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   16909       break;
   16910 
   16911    case 0x3C: /* CMP Ib, AL */
   16912       if (haveF2orF3(pfx)) goto decode_failure;
   16913       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   16914       break;
   16915    case 0x3D: /* CMP Iv, eAX */
   16916       if (haveF2orF3(pfx)) goto decode_failure;
   16917       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   16918       break;
   16919 
   16920    case 0xA8: /* TEST Ib, AL */
   16921       if (haveF2orF3(pfx)) goto decode_failure;
   16922       delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
   16923       break;
   16924    case 0xA9: /* TEST Iv, eAX */
   16925       if (haveF2orF3(pfx)) goto decode_failure;
   16926       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   16927       break;
   16928 
   16929    /* ------------------------ opl Ev, Gv ----------------- */
   16930 
   16931    case 0x02: /* ADD Eb,Gb */
   16932       if (haveF2orF3(pfx)) goto decode_failure;
   16933       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   16934       break;
   16935    case 0x03: /* ADD Ev,Gv */
   16936       if (haveF2orF3(pfx)) goto decode_failure;
   16937       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   16938       break;
   16939 
   16940    case 0x0A: /* OR Eb,Gb */
   16941       if (haveF2orF3(pfx)) goto decode_failure;
   16942       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   16943       break;
   16944    case 0x0B: /* OR Ev,Gv */
   16945       if (haveF2orF3(pfx)) goto decode_failure;
   16946       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   16947       break;
   16948 
   16949    case 0x12: /* ADC Eb,Gb */
   16950       if (haveF2orF3(pfx)) goto decode_failure;
   16951       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   16952       break;
   16953    case 0x13: /* ADC Ev,Gv */
   16954       if (haveF2orF3(pfx)) goto decode_failure;
   16955       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   16956       break;
   16957 
   16958    case 0x1A: /* SBB Eb,Gb */
   16959       if (haveF2orF3(pfx)) goto decode_failure;
   16960       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   16961       break;
   16962    case 0x1B: /* SBB Ev,Gv */
   16963       if (haveF2orF3(pfx)) goto decode_failure;
   16964       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   16965       break;
   16966 
   16967    case 0x22: /* AND Eb,Gb */
   16968       if (haveF2orF3(pfx)) goto decode_failure;
   16969       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   16970       break;
   16971    case 0x23: /* AND Ev,Gv */
   16972       if (haveF2orF3(pfx)) goto decode_failure;
   16973       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   16974       break;
   16975 
   16976    case 0x2A: /* SUB Eb,Gb */
   16977       if (haveF2orF3(pfx)) goto decode_failure;
   16978       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   16979       break;
   16980    case 0x2B: /* SUB Ev,Gv */
   16981       if (haveF2orF3(pfx)) goto decode_failure;
   16982       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   16983       break;
   16984 
   16985    case 0x32: /* XOR Eb,Gb */
   16986       if (haveF2orF3(pfx)) goto decode_failure;
   16987       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   16988       break;
   16989    case 0x33: /* XOR Ev,Gv */
   16990       if (haveF2orF3(pfx)) goto decode_failure;
   16991       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   16992       break;
   16993 
   16994    case 0x3A: /* CMP Eb,Gb */
   16995       if (haveF2orF3(pfx)) goto decode_failure;
   16996       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   16997       break;
   16998    case 0x3B: /* CMP Ev,Gv */
   16999       if (haveF2orF3(pfx)) goto decode_failure;
   17000       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   17001       break;
   17002 
   17003    case 0x84: /* TEST Eb,Gb */
   17004       if (haveF2orF3(pfx)) goto decode_failure;
   17005       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
   17006       break;
   17007    case 0x85: /* TEST Ev,Gv */
   17008       if (haveF2orF3(pfx)) goto decode_failure;
   17009       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
   17010       break;
   17011 
   /* ------------------------ opl Gv, Ev ----------------- */

   /* Standard two-operand ALU group, G(reg),E(reg/mem) order.  Each
      pair follows the x86 pattern: even opcode = byte form (size 1),
      odd opcode = 16/32/64-bit form (size 'sz').  All reject F2/F3
      prefixes.  Argument notes, inferred from the call sites here
      (confirm against dis_op2_G_E, which is outside this view):
        - 3rd arg is True only for ADC/SBB, presumably "fold in the
          carry flag";
        - 5th arg True = write the result back to E, False (CMP) =
          set flags only;
        - the Iop_*8 op is presumably widened internally to match the
          operand size. */

   case 0x00: /* ADD Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x01: /* ADD Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x08: /* OR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x09: /* OR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x10: /* ADC Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x11: /* ADC Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x18: /* SBB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x19: /* SBB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x20: /* AND Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x21: /* AND Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x28: /* SUB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x29: /* SUB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x30: /* XOR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x31: /* XOR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x38: /* CMP Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x39: /* CMP Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;
   17085 
   /* ------------------------ POP ------------------------ */

   /* POP into a general register.  The target register is selected by
      the low 3 bits of the opcode, extended by REX.B. */
   case 0x58: /* POP eAX */
   case 0x59: /* POP eCX */
   case 0x5A: /* POP eDX */
   case 0x5B: /* POP eBX */
   case 0x5D: /* POP eBP */
   case 0x5E: /* POP eSI */
   case 0x5F: /* POP eDI */
   case 0x5C: /* POP eSP */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
      t1 = newTemp(szToITy(sz));
      t2 = newTemp(Ity_I64);
      /* Snapshot RSP, load the value from the stack top, advance RSP
         past it, then write the loaded value to the destination
         register.  Doing the load before the RSP update means
         "pop %rsp" gets the memory value, not the incremented RSP. */
      assign(t2, getIReg64(R_RSP));
      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
      break;
   17108 
   case 0x9D: /* POPF */
      /* Pop the flags word and scatter it into the guest state: the
         OSZACP bits go into the flags thunk (as a COPY), and the D,
         ID and AC bits are held in their own dedicated guest-state
         slots and must be extracted individually. */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG.
         (Mux0X yields its 2nd argument when the 8-bit condition is
         zero, otherwise its 3rd; DFLAG is kept as +1/-1, matching
         CLD/STD below.) */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1)))),
                  mkU64(1),
                  mkU64(0xFFFFFFFFFFFFFFFFULL)))
          );

      /* And set the ID flag: bit 21 of the popped word, stored in its
         own slot as a plain 0/1. */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      /* And set the AC flag too: bit 18, likewise stored as 0/1. */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      DIP("popf%c\n", nameISize(sz));
      break;
   17176 
   17177 //..    case 0x61: /* POPA */
   17178 //..       /* This is almost certainly wrong for sz==2.  So ... */
   17179 //..       if (sz != 4) goto decode_failure;
   17180 //..
   17181 //..       /* t5 is the old %ESP value. */
   17182 //..       t5 = newTemp(Ity_I32);
   17183 //..       assign( t5, getIReg(4, R_ESP) );
   17184 //..
   17185 //..       /* Reload all the registers, except %esp. */
   17186 //..       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   17187 //..       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   17188 //..       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   17189 //..       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   17190 //..       /* ignore saved %ESP */
   17191 //..       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   17192 //..       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   17193 //..       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   17194 //..
   17195 //..       /* and move %ESP back up */
   17196 //..       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   17197 //..
//..       DIP("popa%c\n", nameISize(sz));
   17199 //..       break;
   17200 
   17201    case 0x8F: { /* POPQ m64 / POPW m16 */
   17202       Int   len;
   17203       UChar rm;
   17204       /* There is no encoding for 32-bit pop in 64-bit mode.
   17205          So sz==4 actually means sz==8. */
   17206       if (haveF2orF3(pfx)) goto decode_failure;
   17207       vassert(sz == 2 || sz == 4
   17208               || /* tolerate redundant REX.W, see #210481 */ sz == 8);
   17209       if (sz == 4) sz = 8;
   17210       if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
   17211 
   17212       rm = getUChar(delta);
   17213 
   17214       /* make sure this instruction is correct POP */
   17215       if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
   17216          goto decode_failure;
   17217       /* and has correct size */
   17218       vassert(sz == 8);
   17219 
   17220       t1 = newTemp(Ity_I64);
   17221       t3 = newTemp(Ity_I64);
   17222       assign( t1, getIReg64(R_RSP) );
   17223       assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
   17224 
   17225       /* Increase RSP; must be done before the STORE.  Intel manual
   17226          says: If the RSP register is used as a base register for
   17227          addressing a destination operand in memory, the POP
   17228          instruction computes the effective address of the operand
   17229          after it increments the RSP register.  */
   17230       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
   17231 
   17232       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   17233       storeLE( mkexpr(addr), mkexpr(t3) );
   17234 
   17235       DIP("popl %s\n", dis_buf);
   17236 
   17237       delta += len;
   17238       break;
   17239    }
   17240 
   17241 //.. //--    case 0x1F: /* POP %DS */
   17242 //.. //--       dis_pop_segreg( cb, R_DS, sz ); break;
   17243 //.. //--    case 0x07: /* POP %ES */
   17244 //.. //--       dis_pop_segreg( cb, R_ES, sz ); break;
   17245 //.. //--    case 0x17: /* POP %SS */
   17246 //.. //--       dis_pop_segreg( cb, R_SS, sz ); break;
   17247 
   /* ------------------------ PUSH ----------------------- */

   /* PUSH a general register; the source register is the low 3 opcode
      bits, extended by REX.B. */
   case 0x50: /* PUSH eAX */
   case 0x51: /* PUSH eCX */
   case 0x52: /* PUSH eDX */
   case 0x53: /* PUSH eBX */
   case 0x55: /* PUSH eBP */
   case 0x56: /* PUSH eSI */
   case 0x57: /* PUSH eDI */
   case 0x54: /* PUSH eSP */
      /* This is the Right Way, in that the value to be pushed is
         established before %rsp is changed, so that pushq %rsp
         correctly pushes the old value. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
      ty = sz==2 ? Ity_I16 : Ity_I64;
      t1 = newTemp(ty);
      t2 = newTemp(Ity_I64);
      /* t1 = value to push (read before RSP changes); t2 = new RSP. */
      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
      putIReg64(R_RSP, mkexpr(t2) );
      storeLE(mkexpr(t2),mkexpr(t1));
      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
      break;
   17274 
   /* PUSH of an immediate: 0x68 takes a 16/32-bit immediate
      (sign-extended to the operand size), 0x6A a sign-extended
      8-bit immediate.  Both share the do_push_I tail. */
   case 0x68: /* PUSH Iv */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      d64 = getSDisp(imin(4,sz),delta);
      delta += imin(4,sz);
      goto do_push_I;
   case 0x6A: /* PUSH Ib, sign-extended to sz */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      d64 = getSDisp8(delta); delta += 1;
      goto do_push_I;
   do_push_I:
      /* Common tail: decrement RSP by the operand size, then store
         the immediate at the new stack top. */
      ty = szToITy(sz);
      t1 = newTemp(Ity_I64);
      t2 = newTemp(ty);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d64 is a negative 16-bit number
         (bug #132813) */
      if (ty == Ity_I16)
         d64 &= 0xFFFF;
      storeLE( mkexpr(t1), mkU(ty,d64) );
      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
      break;
   17301 
   case 0x9C: /* PUSHF */ {
      /* Reconstruct the architectural flags word from the guest
         state and push it.  The OSZACP bits come from the flags
         thunk (mk_amd64g_calculate_rflags_all); D, ID and AC live in
         separate guest-state slots and are OR-ed in by hand. */
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]: DFLAG is kept as +1 (DF clear) or -1
         (DF set, all bits set) -- see CLD/STD below -- so its bit 10
         is exactly the architectural DF. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag: stored as 0/1, shifted up to bit
         21. */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too: stored as 0/1, shifted up to
         bit 18. */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed.  (Currently
         unreachable: sz!=8 bails out to decode_failure above.) */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      break;
   }
   17359 
   17360 //..    case 0x60: /* PUSHA */
   17361 //..       /* This is almost certainly wrong for sz==2.  So ... */
   17362 //..       if (sz != 4) goto decode_failure;
   17363 //..
   17364 //..       /* This is the Right Way, in that the value to be pushed is
   17365 //..          established before %esp is changed, so that pusha
   17366 //..          correctly pushes the old %esp value.  New value of %esp is
   17367 //..          pushed at start. */
   17368 //..       /* t0 is the %ESP value we're going to push. */
   17369 //..       t0 = newTemp(Ity_I32);
   17370 //..       assign( t0, getIReg(4, R_ESP) );
   17371 //..
   17372 //..       /* t5 will be the new %ESP value. */
   17373 //..       t5 = newTemp(Ity_I32);
   17374 //..       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   17375 //..
   17376 //..       /* Update guest state before prodding memory. */
   17377 //..       putIReg(4, R_ESP, mkexpr(t5));
   17378 //..
   17379 //..       /* Dump all the registers. */
   17380 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   17381 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   17382 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   17383 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   17384 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   17385 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   17386 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   17387 //..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   17388 //..
   17389 //..       DIP("pusha%c\n", nameISize(sz));
   17390 //..       break;
   17391 //..
   17392 //..
   17393 //.. //--    case 0x0E: /* PUSH %CS */
   17394 //.. //--       dis_push_segreg( cb, R_CS, sz ); break;
   17395 //.. //--    case 0x1E: /* PUSH %DS */
   17396 //.. //--       dis_push_segreg( cb, R_DS, sz ); break;
   17397 //.. //--    case 0x06: /* PUSH %ES */
   17398 //.. //--       dis_push_segreg( cb, R_ES, sz ); break;
   17399 //.. //--    case 0x16: /* PUSH %SS */
   17400 //.. //--       dis_push_segreg( cb, R_SS, sz ); break;
   17401 //..
   17402 //..    /* ------------------------ SCAS et al ----------------- */
   17403 //..
   17404 //..    case 0xA4: /* MOVS, no REP prefix */
   17405 //..    case 0xA5:
   17406 //..       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   17407 //..       break;
   17408 //..
   17409 //..   case 0xA6: /* CMPSb, no REP prefix */
   17410 //.. //--    case 0xA7:
   17411 //..      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   17412 //..      break;
   17413 //.. //--
   17414 //.. //--
    /* NOTE(review): unlike every other case in this switch, LODS does
       not reject F2/F3 prefixes, so e.g. "rep lods" would be decoded
       as a single lods here -- confirm whether that is intentional. */
    case 0xAC: /* LODS, no REP prefix */
    case 0xAD:
       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
       break;
   17419 //..
   17420 //..    case 0xAE: /* SCAS, no REP prefix */
   17421 //..    case 0xAF:
   17422 //..       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   17423 //..       break;
   17424 
   17425 
   /* CLD/STD: the guest D flag is stored expanded as +1 (DF clear)
      or -1 (DF set) in its own state slot, matching what POPF/PUSHF
      above read and write. */
   case 0xFC: /* CLD */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
      DIP("cld\n");
      break;

   case 0xFD: /* STD */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
      DIP("std\n");
      break;
   17437 
   /* CLC/STC/CMC: materialise the full rflags value, clear/set/flip
      the carry bit in it, then re-install the result via a COPY
      thunk (CC_DEP1 holds the literal flags word). */
   case 0xF8: /* CLC */
   case 0xF9: /* STC */
   case 0xF5: /* CMC */
      t0 = newTemp(Ity_I64);
      t1 = newTemp(Ity_I64);
      assign( t0, mk_amd64g_calculate_rflags_all() );
      switch (opc) {
         case 0xF8:
            assign( t1, binop(Iop_And64, mkexpr(t0),
                                         mkU64(~AMD64G_CC_MASK_C)));
            DIP("clc\n");
            break;
         case 0xF9:
            assign( t1, binop(Iop_Or64, mkexpr(t0),
                                        mkU64(AMD64G_CC_MASK_C)));
            DIP("stc\n");
            break;
         case 0xF5:
            assign( t1, binop(Iop_Xor64, mkexpr(t0),
                                         mkU64(AMD64G_CC_MASK_C)));
            DIP("cmc\n");
            break;
         default:
            /* unreachable: opc is one of the three cases above */
            vpanic("disInstr(x64)(clc/stc/cmc)");
      }
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      break;
   17470 
   17471 //..    /* REPNE prefix insn */
   17472 //..    case 0xF2: {
   17473 //..       Addr32 eip_orig = guest_eip_bbstart + delta - 1;
   17474 //..       vassert(sorb == 0);
   17475 //..       abyte = getUChar(delta); delta++;
   17476 //..
   17477 //..       if (abyte == 0x66) { sz = 2; abyte = getUChar(delta); delta++; }
   17478 //..       whatNext = Dis_StopHere;
   17479 //..
   17480 //..       switch (abyte) {
   17481 //..       /* According to the Intel manual, "repne movs" should never occur, but
   17482 //..        * in practice it has happened, so allow for it here... */
   17483 //..       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   17484 //..         goto decode_failure;
   17485 //.. //--       case 0xA5:
   17486 //..         //         dis_REP_op ( CondNZ, dis_MOVS, sz, eip_orig,
   17487 //..         //                              guest_eip_bbstart+delta, "repne movs" );
   17488 //..         //         break;
   17489 //.. //--
   17490 //.. //--       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   17491 //.. //--       case 0xA7:
   17492 //.. //--          dis_REP_op ( cb, CondNZ, dis_CMPS, sz, eip_orig, eip, "repne cmps" );
   17493 //.. //--          break;
   17494 //.. //--
   17495 //..       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   17496 //..       case 0xAF:
   17497 //..          dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
   17498 //..                                  guest_eip_bbstart+delta, "repne scas" );
   17499 //..          break;
   17500 //..
   17501 //..       default:
   17502 //..          goto decode_failure;
   17503 //..       }
   17504 //..       break;
   17505 //..    }
   17506 
   /* ------ AE: SCAS variants ------ */
   /* AE is the byte form (sz forced to 1), AF the 16/32/64-bit form.
      F2 = repne, F3 = repe, neither = single iteration.  The REP
      forms end the translation (Dis_StopHere) because dis_REP_op
      emits a loop back to this instruction's own address. */
   case 0xAE:
   case 0xAF:
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         dres.whatNext = Dis_StopHere;
         break;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         dres.whatNext = Dis_StopHere;
         break;
      }
      /* AE/AF: scasb/scas{w,l,q} */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         break;
      }
      /* both F2 and F3 present: invalid */
      goto decode_failure;
   17538 
   /* ------ A6, A7: CMPS variants ------ */
   /* Only the F3 (repe) form is handled here; plain cmps and the F2
      (repne) form fall through to decode_failure. */
   case 0xA6:
   case 0xA7:
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres.whatNext = Dis_StopHere;
         break;
      }
      goto decode_failure;
   17553 
   /* ------ AA, AB: STOS variants ------ */
   /* F3 = rep stos, no prefix = single stos; the F2 form is not
      handled and falls to decode_failure. */
   case 0xAA:
   case 0xAB:
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
        dres.whatNext = Dis_StopHere;
        break;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         break;
      }
      goto decode_failure;
   17575 
   /* ------ A4, A5: MOVS variants ------ */
   /* F3 = rep movs, no prefix = single movs; the F2 form is not
      handled and falls to decode_failure. */
   case 0xA4:
   case 0xA5:
      /* F3 A4: rep movsb */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_REP_op ( AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres.whatNext = Dis_StopHere;
        break;
      }
      /* A4: movsb */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         break;
      }
      goto decode_failure;
   17597 
   17598 
   /* ------------------------ XCHG ----------------------- */

   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
      prefix.  Therefore, surround it with a IRStmt_MBE(Imbe_BusLock)
      and IRStmt_MBE(Imbe_BusUnlock) pair.  But be careful; if it is
      used with an explicit LOCK prefix, we don't want to end up with
      two IRStmt_MBE(Imbe_BusLock)s -- one made here and one made by
      the generic LOCK logic at the top of disInstr. */
   case 0x86: /* XCHG Gb,Eb */
      sz = 1;
      /* Fall through ... */
   case 0x87: /* XCHG Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      ty = szToITy(sz);
      t1 = newTemp(ty); t2 = newTemp(ty);
      if (epartIsReg(modrm)) {
         /* Register-register form: a plain swap through two temps. */
         assign(t1, getIRegE(sz, pfx, modrm));
         assign(t2, getIRegG(sz, pfx, modrm));
         putIRegG(sz, pfx, modrm, mkexpr(t1));
         putIRegE(sz, pfx, modrm, mkexpr(t2));
         delta++;
         DIP("xchg%c %s, %s\n",
             nameISize(sz), nameIRegG(sz, pfx, modrm),
                            nameIRegE(sz, pfx, modrm));
      } else {
         /* Register-memory form: use casLE so the swap is atomic,
            reflecting the implicit LOCK semantics.  *expect_CAS
            presumably tells the caller a CAS was generated -- TODO
            confirm against the caller. */
         *expect_CAS = True;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         assign( t1, loadLE(ty, mkexpr(addr)) );
         assign( t2, getIRegG(sz, pfx, modrm) );
         casLE( mkexpr(addr),
                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
         putIRegG( sz, pfx, modrm, mkexpr(t1) );
         delta += alen;
         DIP("xchg%c %s, %s\n", nameISize(sz),
                                nameIRegG(sz, pfx, modrm), dis_buf);
      }
      break;
   17637 
   /* XCHG rAX with another GPR.  0x90 is special-cased: with F3 it
      is PAUSE (rep nop), and without REX.B it is a true NOP since
      rAX would be exchanged with itself. */
   case 0x90: /* XCHG eAX,eAX */
      /* detect and handle F3 90 (rep nop) specially */
      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(Ijk_Yield, guest_RIP_bbstart+delta);
         dres.whatNext = Dis_StopHere;
         break;
      }
      /* detect and handle NOPs specially */
      if (/* F2/F3 probably change meaning completely */
          !haveF2orF3(pfx)
          /* If REX.B is 1, we're not exchanging rAX with itself */
          && getRexB(pfx)==0 ) {
         DIP("nop\n");
         break;
      }
      /* else fall through to normal case. */
   case 0x91: /* XCHG rAX,rCX */
   case 0x92: /* XCHG rAX,rDX */
   case 0x93: /* XCHG rAX,rBX */
   case 0x94: /* XCHG rAX,rSP */
   case 0x95: /* XCHG rAX,rBP */
   case 0x96: /* XCHG rAX,rSI */
   case 0x97: /* XCHG rAX,rDI */

      /* guard against mutancy */
      if (haveF2orF3(pfx)) goto decode_failure;

      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
      break;
   17670 
   17671 //.. //--    /* ------------------------ XLAT ----------------------- */
   17672 //.. //--
   17673 //.. //--    case 0xD7: /* XLAT */
   17674 //.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
   17675 //.. //--       uInstr2(cb, GET, sz, ArchReg, R_EBX, TempReg, t1); /* get eBX */
   17676 //.. //--       handleAddrOverrides( cb, sorb, t1 );               /* make t1 DS:eBX */
   17677 //.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t2); /* get AL */
   17678 //.. //--       /* Widen %AL to 32 bits, so it's all defined when we add it. */
   17679 //.. //--       uInstr1(cb, WIDEN, 4, TempReg, t2);
   17680 //.. //--       uWiden(cb, 1, False);
   17681 //.. //--       uInstr2(cb, ADD, sz, TempReg, t2, TempReg, t1);  /* add AL to eBX */
   17682 //.. //--       uInstr2(cb, LOAD, 1, TempReg, t1,  TempReg, t2); /* get byte at t1 into t2 */
   17683 //.. //--       uInstr2(cb, PUT, 1, TempReg, t2, ArchReg, R_AL); /* put byte into AL */
   17684 //.. //--
   17685 //.. //--       DIP("xlat%c [ebx]\n", nameISize(sz));
   17686 //.. //--       break;
   17687 
   /* ------------------------ IN / OUT ----------------------- */

   /* IN: port I/O cannot be modelled in pure IR, so each variant
      loads the port number into t1 and jumps to the shared do_IN
      tail, which calls the amd64g_dirtyhelper_IN helper at run
      time. */
   case 0xE4: /* IN imm8, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I64);
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "amd64g_dirtyhelper_IN",
             &amd64g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      /* Narrow the 64-bit helper result down to the operand size and
         write it to rAX. */
      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
      break;
   }
   17738 
   case 0xE6: /* OUT AL, imm8 */
      /* 8-bit OUT to an immediate port number, zero-extended into t1
         for the shared do_OUT sequence. */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xE7: /* OUT eAX, imm8 */
      /* 16/32-bit OUT to an immediate port; OUT has no 64-bit form. */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xEE: /* OUT AL, %DX */
      /* 8-bit OUT; port number comes from %DX. */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   case 0xEF: /* OUT eAX, %DX */
      /* 16/32-bit OUT; port number comes from %DX. */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      /* Like IN, a port write is side-effecting, so delegate to a
         dirty helper; no result is returned (unsafeIRDirty_0_N). */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      d = unsafeIRDirty_0_N(
             0/*regparms*/,
             "amd64g_dirtyhelper_OUT",
             &amd64g_dirtyhelper_OUT,
             mkIRExprVec_3( mkexpr(t1),
                            widenUto64( getIRegRAX(sz) ),
                            mkU64(sz) )
          );
      stmt( IRStmt_Dirty(d) );
      break;
   }
   17785 
   17786    /* ------------------------ (Grp1 extensions) ---------- */
   17787 
   case 0x80: /* Grp1 Ib,Eb */
      /* Group-1 (ADD/OR/ADC/SBB/AND/SUB/XOR/CMP, selected by the reg
         field of the modrm byte) with byte operand and byte imm. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;

   case 0x81: /* Grp1 Iv,Ev */
      /* Full-size immediate form; the imm is at most 4 bytes even at
         sz==8 (64-bit ops take a sign-extended 32-bit immediate),
         hence imin(sz,4). */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = imin(sz,4);
      d64   = getSDisp(d_sz, delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;

   case 0x83: /* Grp1 Ib,Ev */
      /* Sign-extended byte immediate applied to a 16/32/64-bit
         operand. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      break;
   17815 
   17816    /* ------------------------ (Grp2 extensions) ---------- */
   17817 
   case 0xC0: { /* Grp2 Ib,Eb */
      /* Group-2 (rotate/shift, selected by modrm reg field): byte
         operand, shift amount from an imm8. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xC1: { /* Grp2 Ib,Ev */
      /* 16/32/64-bit operand, shift amount from an imm8. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD0: { /* Grp2 1,Eb */
      /* Byte operand, implicit shift amount of 1 (d_sz==0: no
         immediate byte in the instruction stream). */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD1: { /* Grp2 1,Ev */
      /* 16/32/64-bit operand, implicit shift amount of 1. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD2: { /* Grp2 CL,Eb */
      /* Byte operand, shift amount taken from %CL at run time. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xD3: { /* Grp2 CL,Ev */
      /* 16/32/64-bit operand, shift amount taken from %CL. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   17891 
   17892    /* ------------------------ (Grp3 extensions) ---------- */
   17893 
   case 0xF6: { /* Grp3 Eb */
      /* Group-3 (TEST/NOT/NEG/MUL/IMUL/DIV/IDIV), byte operand. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   case 0xF7: { /* Grp3 Ev */
      /* Group-3, operand size given by sz (16/32/64). */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }

   /* ------------------------ (Grp4 extensions) ---------- */

   case 0xFE: { /* Grp4 Eb */
      /* Group-4 (INC/DEC on a byte operand). */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }

   /* ------------------------ (Grp5 extensions) ---------- */

   case 0xFF: { /* Grp5 Ev */
      /* Group-5 (INC/DEC/CALL/JMP/PUSH).  May end the block (CALL/
         JMP), hence dres is passed through. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp5 ( vbi, pfx, sz, delta, &dres, &decode_OK );
      if (!decode_OK) goto decode_failure;
      break;
   }
   17928 
   17929    /* ------------------------ Escapes to 2-byte opcodes -- */
   17930 
   17931    case 0x0F: {
   17932       opc = getUChar(delta); delta++;
   17933       switch (opc) {
   17934 
   17935       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   17936 
      case 0xBA: { /* Grp8 Ib,Ev */
         /* Group-8 (BT/BTS/BTR/BTC with immediate bit index).  Note
            decode_OK starts False here: dis_Grp8_Imm must positively
            accept the encoding. */
         Bool decode_OK = False;
         if (haveF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         am_sz = lengthAMode(pfx,delta);
         d64   = getSDisp8(delta + am_sz);
         delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
                                &decode_OK );
         if (!decode_OK)
            goto decode_failure;
         break;
      }

      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */

      case 0xBC: /* BSF Gv,Ev */
         /* Bit scan forward (last arg True selects forward). */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
         break;
      case 0xBD: /* BSR Gv,Ev */
         /* Bit scan reverse. */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
         break;
   17960 
   17961       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   17962 
      case 0xC8: /* BSWAP %eax */
      case 0xC9:
      case 0xCA:
      case 0xCB:
      case 0xCC:
      case 0xCD:
      case 0xCE:
      case 0xCF: /* BSWAP %edi */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* According to the AMD64 docs, this insn can have size 4 or
            8. */
         if (sz == 4) {
            /* 32-bit byte swap, built directly as a shift/mask/OR
               tree over the four byte lanes. */
            t1 = newTemp(Ity_I32);
            t2 = newTemp(Ity_I32);
            assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
            assign( t2,
               binop(Iop_Or32,
                  binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x00FF0000)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x0000FF00)),
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
                                   mkU32(0x000000FF) )
               )))
            );
            putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
            break;
         }
         else if (sz == 8) {
            /* 64-bit byte swap via three swap-adjacent-lanes stages:
               swap bytes within each 16-bit pair (s8), then 16-bit
               halves within each 32-bit word (s16), then the two
               32-bit halves (t2). */
            IRTemp m8  = newTemp(Ity_I64);
            IRTemp s8  = newTemp(Ity_I64);
            IRTemp m16 = newTemp(Ity_I64);
            IRTemp s16 = newTemp(Ity_I64);
            IRTemp m32 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I64);
            t2 = newTemp(Ity_I64);
            assign( t1, getIRegRexB(8, pfx, opc-0xC8) );

            assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
            assign( s8,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
                                mkU8(8)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
                                mkexpr(m8))
                         )
                  );

            assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
            assign( s16,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
                                mkU8(16)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
                                mkexpr(m16))
                         )
                  );

            assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
            assign( t2,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
                                mkU8(32)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
                                mkexpr(m32))
                         )
                  );

            putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
            break;
         } else {
            /* sz==2: result is undefined per the architecture docs,
               so refuse to decode rather than guess. */
            goto decode_failure;
         }
   18047 
   18048       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   18049 
   18050       /* All of these are possible at sizes 2, 4 and 8, but until a
   18051          size 2 test case shows up, only handle sizes 4 and 8. */
   18052 
      case 0xA3: /* BT Gv,Ev */
         /* Bit test (no modification of the tested operand). */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
         break;
      case 0xB3: /* BTR Gv,Ev */
         /* Bit test and reset (clear the bit). */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
         break;
      case 0xAB: /* BTS Gv,Ev */
         /* Bit test and set. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
         break;
      case 0xBB: /* BTC Gv,Ev */
         /* Bit test and complement (flip the bit). */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
         break;
   18073 
   18074       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   18075 
      /* The condition code is simply (opc - 0x40), matching the
         AMD64Condcode enumeration. */
      case 0x40: /* CMOVO (cmov overflow) */
      case 0x41: /* CMOVNO (cmov no overflow) */
      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
      case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVNSb (cmov not negative) */
      case 0x4A: /* CMOVP (cmov parity even) */
      case 0x4B: /* CMOVNP (cmov parity odd) */
      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
         break;
   18095 
   18096       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   18097 
      case 0xB0: { /* CMPXCHG Gb,Eb */
         /* Byte-sized compare-and-exchange. */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
         if (!ok) goto decode_failure;
         break;
      }
      case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
         if (!ok) goto decode_failure;
         break;
      }
   18113 
      case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
         /* Double-width compare-and-swap on memory: RDX:RAX is the
            expected value, RCX:RBX the replacement.  Element width is
            32 bits (cmpxchg8b, sz==4) or 64 bits (cmpxchg16b, sz==8);
            all size-dependent ops are selected up front. */
         IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
         IRTemp  expdHi     = newTemp(elemTy);
         IRTemp  expdLo     = newTemp(elemTy);
         IRTemp  dataHi     = newTemp(elemTy);
         IRTemp  dataLo     = newTemp(elemTy);
         IRTemp  oldHi      = newTemp(elemTy);
         IRTemp  oldLo      = newTemp(elemTy);
         IRTemp  flags_old  = newTemp(Ity_I64);
         IRTemp  flags_new  = newTemp(Ity_I64);
         IRTemp  success    = newTemp(Ity_I1);
         IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
         IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
         IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
         IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
         IRTemp expdHi64    = newTemp(Ity_I64);
         IRTemp expdLo64    = newTemp(Ity_I64);

         /* Translate this using a DCAS, even if there is no LOCK
            prefix.  Life is too short to bother with generating two
            different translations for the with/without-LOCK-prefix
            cases. */
         *expect_CAS = True;

         /* Decode, and generate address. */
         if (have66orF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8) goto decode_failure;
         /* cmpxchg16b is only available when the host advertises
            the CX16 capability. */
         if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
            goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         if (gregLO3ofRM(modrm) != 1) goto decode_failure;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* cmpxchg16b requires an alignment check. */
         if (sz == 8)
            gen_SEGV_if_not_16_aligned( addr );

         /* Get the expected and new values. */
         assign( expdHi64, getIReg64(R_RDX) );
         assign( expdLo64, getIReg64(R_RAX) );

         /* These are the correctly-sized expected and new values.
            However, we also get expdHi64/expdLo64 above as 64-bits
            regardless, because we will need them later in the 32-bit
            case (paradoxically). */
         assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
                               : mkexpr(expdHi64) );
         assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
                               : mkexpr(expdLo64) );
         assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
         assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );

         /* Do the DCAS */
         stmt( IRStmt_CAS(
                  mkIRCAS( oldHi, oldLo,
                           Iend_LE, mkexpr(addr),
                           mkexpr(expdHi), mkexpr(expdLo),
                           mkexpr(dataHi), mkexpr(dataLo)
               )));

         /* success when oldHi:oldLo == expdHi:expdLo */
         /* Computed as ((oldHi ^ expdHi) | (oldLo ^ expdLo)) == 0,
            using CasCmpEQ so memcheck treats the comparison
            appropriately. */
         assign( success,
                 binop(opCasCmpEQ,
                       binop(opOR,
                             binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
                             binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
                       ),
                       zero
                 ));

         /* If the DCAS is successful, that is to say oldHi:oldLo ==
            expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
            which is where they came from originally.  Both the actual
            contents of these two regs, and any shadow values, are
            unchanged.  If the DCAS fails then we're putting into
            RDX:RAX the value seen in memory. */
         /* Now of course there's a complication in the 32-bit case
            (bah!): if the DCAS succeeds, we need to leave RDX:RAX
            unchanged; but if we use the same scheme as in the 64-bit
            case, we get hit by the standard rule that a write to the
            bottom 32 bits of an integer register zeros the upper 32
            bits.  And so the upper halves of RDX and RAX mysteriously
            become zero.  So we have to stuff back in the original
            64-bit values which we previously stashed in
            expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
         /* It's just _so_ much fun ... */
         putIRegRDX( 8,
                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
                                           : mkexpr(oldHi),
                                   mkexpr(expdHi64)
                   ));
         putIRegRAX( 8,
                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
                                           : mkexpr(oldLo),
                                   mkexpr(expdLo64)
                   ));

         /* Copy the success bit into the Z flag and leave the others
            unchanged */
         assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
         assign(
            flags_new,
            binop(Iop_Or64,
                  binop(Iop_And64, mkexpr(flags_old),
                                   mkU64(~AMD64G_CC_MASK_Z)),
                  binop(Iop_Shl64,
                        binop(Iop_And64,
                              unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
                        mkU8(AMD64G_CC_SHIFT_Z)) ));

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         /* Set NDEP even though it isn't used.  This makes
            redundant-PUT elimination of previous stores to this field
            work better. */
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

         /* Sheesh.  Aren't you glad it was me and not you that had to
            write and validate all this grunge? */

         /* NOTE(review): this prints "cmpxchg8b" even when sz==8
            (i.e. cmpxchg16b) -- debug-output cosmetic only. */
         DIP("cmpxchg8b %s\n", dis_buf);
         break;

      }
   18243 
   18244       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   18245 
      case 0xA2: { /* CPUID */
         /* Uses dirty helper:
               void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
            declared to mod rax, wr rbx, rcx, rdx
         */
         /* The helper chosen determines what CPU the guest appears
            to be running on; it is selected from the hwcaps the
            translation was configured with. */
         IRDirty* d     = NULL;
         HChar*   fName = NULL;
         void*    fAddr = NULL;
         if (haveF2orF3(pfx)) goto decode_failure;
         if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
                                  |VEX_HWCAPS_AMD64_CX16)) {
            fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16";
            fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16;
            /* This is a Core-2-like machine */
            //fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
            //fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
            /* This is a Core-i5-like machine */
         }
         else {
            /* Give a CPUID for at least a baseline machine, SSE2
               only, and no CX16 */
            fName = "amd64g_dirtyhelper_CPUID_baseline";
            fAddr = &amd64g_dirtyhelper_CPUID_baseline;
         }

         vassert(fName); vassert(fAddr);
         d = unsafeIRDirty_0_N ( 0/*regparms*/,
                                 fName, fAddr, mkIRExprVec_0() );
         /* declare guest state effects */
         /* needsBBP: the helper takes the guest state pointer so it
            can read RAX/RCX and write the result registers itself. */
         d->needsBBP = True;
         d->nFxState = 4;
         d->fxState[0].fx     = Ifx_Modify;
         d->fxState[0].offset = OFFB_RAX;
         d->fxState[0].size   = 8;
         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_RBX;
         d->fxState[1].size   = 8;
         d->fxState[2].fx     = Ifx_Modify;
         d->fxState[2].offset = OFFB_RCX;
         d->fxState[2].size   = 8;
         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_RDX;
         d->fxState[3].size   = 8;
         /* execute the dirty call, side-effecting guest state */
         stmt( IRStmt_Dirty(d) );
         /* CPUID is a serialising insn.  So, just in case someone is
            using it as a memory fence ... */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("cpuid\n");
         break;
      }
   18297 
   18298       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   18299 
      /* Widening moves: third-from-last arg is the source width in
         bytes, last arg selects sign- (True) vs zero-extension. */
      case 0xB6: /* MOVZXb Eb,Gv */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
         break;
      case 0xB7: /* MOVZXw Ew,Gv */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
         break;

      case 0xBE: /* MOVSXb Eb,Gv */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
         break;
      case 0xBF: /* MOVSXw Ew,Gv */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
         break;
   18325 
   18326 //.. //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   18327 //.. //--
   18328 //.. //--       case 0xC3: /* MOVNTI Gv,Ev */
   18329 //.. //--          vg_assert(sz == 4);
   18330 //.. //--          modrm = getUChar(eip);
   18331 //.. //--          vg_assert(!epartIsReg(modrm));
   18332 //.. //--          t1 = newTemp(cb);
   18333 //.. //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   18334 //.. //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   18335 //.. //--          t2 = LOW24(pair);
   18336 //.. //--          eip += HI8(pair);
   18337 //.. //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   18338 //.. //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   18339 //.. //--          break;
   18340 
   18341       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   18342 
      case 0xAF: /* IMUL Ev, Gv */
         /* Two-operand signed multiply into a register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_mul_E_G ( vbi, pfx, sz, delta );
         break;

      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */

      case 0x1F:
         /* Multi-byte NOP (0F 1F /0): decode the address mode purely
            to consume the right number of bytes; generates no IR. */
         if (haveF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("nop%c %s\n", nameISize(sz), dis_buf);
         break;
   18358 
   18359       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
      /* Conditional jumps with 32-bit displacement; the condition
         code is (opc - 0x80), matching the AMD64Condcode enum. */
      case 0x80: /* JO (jump overflow) */
      case 0x81: /* JNO (jump no overflow) */
      case 0x82: /* JBb/JNAEb (jump below) */
      case 0x83: /* JNBb/JAEb (jump not below) */
      case 0x84: /* JZb/JEb (jump zero) */
      case 0x85: /* JNZb/JNEb (jump not zero) */
      case 0x86: /* JBEb/JNAb (jump below or equal) */
      case 0x87: /* JNBEb/JAb (jump not below or equal) */
      case 0x88: /* JSb (jump negative) */
      case 0x89: /* JNSb (jump not negative) */
      case 0x8A: /* JP (jump parity even) */
      case 0x8B: /* JNP/JPO (jump parity odd) */
      case 0x8C: /* JLb/JNGEb (jump less) */
      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
      case 0x8E: /* JLEb/JNGb (jump less or equal) */
      case 0x8F: /* JGb/JNLEb (jump greater) */
       { Long   jmpDelta;
         HChar* comment  = "";
         if (haveF2orF3(pfx)) goto decode_failure;
         jmpDelta = getSDisp32(delta);
         /* d64 = absolute target; +4 accounts for the displacement
            bytes themselves. */
         d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
         delta += 4;
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta < 0
             && resteerOkFn( callback_opaque, d64) ) {
            /* Speculation: assume this backward branch is taken.  So
               we need to emit a side-exit to the insn following this
               one, on the negation of the condition, and continue at
               the branch target address (d64).  If we wind up back at
               the first instruction of the trace, just stop; it's
               better to let the IR loop unroller handle that case. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition(
                        (AMD64Condcode)(1 ^ (opc - 0x80))),
                     Ijk_Boring,
                     IRConst_U64(guest_RIP_bbstart+delta) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = d64;
            comment = "(assumed taken)";
         }
         else
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta >= 0
             && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
            /* Speculation: assume this forward branch is not taken.
               So we need to emit a side-exit to d64 (the dest) and
               continue disassembling at the insn immediately
               following this one. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition((AMD64Condcode)
                                                   (opc - 0x80)),
                     Ijk_Boring,
                     IRConst_U64(d64) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = guest_RIP_bbstart+delta;
            comment = "(assumed not taken)";
         }
         else {
            /* Conservative default translation - end the block at
               this point. */
            jcc_01( (AMD64Condcode)(opc - 0x80),
                    guest_RIP_bbstart+delta,
                    d64 );
            dres.whatNext = Dis_StopHere;
         }
         DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
         break;
       }
   18432 
   18433       /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
   18434       case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
   18435                  /* 0F 0D /1 -- prefetchw mem8 */
   18436          if (have66orF2orF3(pfx)) goto decode_failure;
   18437          modrm = getUChar(delta);
   18438          if (epartIsReg(modrm)) goto decode_failure;
   18439          if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   18440             goto decode_failure;
   18441 
   18442          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18443          delta += alen;
   18444 
   18445          switch (gregLO3ofRM(modrm)) {
   18446             case 0: DIP("prefetch %s\n", dis_buf); break;
   18447             case 1: DIP("prefetchw %s\n", dis_buf); break;
   18448             default: vassert(0); /*NOTREACHED*/
   18449          }
   18450          break;
   18451 
   18452       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   18453       case 0x31: { /* RDTSC */
   18454          IRTemp   val  = newTemp(Ity_I64);
   18455          IRExpr** args = mkIRExprVec_0();
   18456          IRDirty* d    = unsafeIRDirty_1_N (
   18457                             val,
   18458                             0/*regparms*/,
   18459                             "amd64g_dirtyhelper_RDTSC",
   18460                             &amd64g_dirtyhelper_RDTSC,
   18461                             args
   18462                          );
   18463          if (have66orF2orF3(pfx)) goto decode_failure;
   18464          /* execute the dirty call, dumping the result in val. */
   18465          stmt( IRStmt_Dirty(d) );
   18466          putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
   18467          putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
   18468          DIP("rdtsc\n");
   18469          break;
   18470       }
   18471 
   18472 //..       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   18473 //..
   18474 //..       case 0xA1: /* POP %FS */
   18475 //..          dis_pop_segreg( R_FS, sz ); break;
   18476 //..       case 0xA9: /* POP %GS */
   18477 //..          dis_pop_segreg( R_GS, sz ); break;
   18478 //..
   18479 //..       case 0xA0: /* PUSH %FS */
   18480 //..          dis_push_segreg( R_FS, sz ); break;
   18481 //..       case 0xA8: /* PUSH %GS */
   18482 //..          dis_push_segreg( R_GS, sz ); break;
   18483 
   18484       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   18485       case 0x90:
   18486       case 0x91:
   18487       case 0x92: /* set-Bb/set-NAEb (set if below) */
   18488       case 0x93: /* set-NBb/set-AEb (set if not below) */
   18489       case 0x94: /* set-Zb/set-Eb (set if zero) */
   18490       case 0x95: /* set-NZb/set-NEb (set if not zero) */
   18491       case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   18492       case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   18493       case 0x98: /* set-Sb (set if negative) */
      case 0x99: /* set-NSb (set if not negative) */
   18495       case 0x9A: /* set-P (set if parity even) */
   18496       case 0x9B: /* set-NP (set if parity odd) */
   18497       case 0x9C: /* set-Lb/set-NGEb (set if less) */
   18498       case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   18499       case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   18500       case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   18501          if (haveF2orF3(pfx)) goto decode_failure;
   18502          t1 = newTemp(Ity_I8);
   18503          assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
   18504          modrm = getUChar(delta);
   18505          if (epartIsReg(modrm)) {
   18506             delta++;
   18507             putIRegE(1, pfx, modrm, mkexpr(t1));
   18508             DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
   18509                               nameIRegE(1,pfx,modrm));
   18510          } else {
   18511             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18512             delta += alen;
   18513             storeLE( mkexpr(addr), mkexpr(t1) );
   18514             DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
   18515          }
   18516          break;
   18517 
   18518       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   18519 
   18520       case 0xA4: /* SHLDv imm8,Gv,Ev */
   18521          modrm = getUChar(delta);
   18522          d64   = delta + lengthAMode(pfx, delta);
   18523          vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   18524          delta = dis_SHLRD_Gv_Ev (
   18525                     vbi, pfx, delta, modrm, sz,
   18526                     mkU8(getUChar(d64)), True, /* literal */
   18527                     dis_buf, True /* left */ );
   18528          break;
   18529       case 0xA5: /* SHLDv %cl,Gv,Ev */
   18530          modrm = getUChar(delta);
   18531          delta = dis_SHLRD_Gv_Ev (
   18532                     vbi, pfx, delta, modrm, sz,
   18533                     getIRegCL(), False, /* not literal */
   18534                     "%cl", True /* left */ );
   18535          break;
   18536 
   18537       case 0xAC: /* SHRDv imm8,Gv,Ev */
   18538          modrm = getUChar(delta);
   18539          d64   = delta + lengthAMode(pfx, delta);
   18540          vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   18541          delta = dis_SHLRD_Gv_Ev (
   18542                     vbi, pfx, delta, modrm, sz,
   18543                     mkU8(getUChar(d64)), True, /* literal */
   18544                     dis_buf, False /* right */ );
   18545          break;
   18546       case 0xAD: /* SHRDv %cl,Gv,Ev */
   18547          modrm = getUChar(delta);
   18548          delta = dis_SHLRD_Gv_Ev (
   18549                     vbi, pfx, delta, modrm, sz,
   18550                     getIRegCL(), False, /* not literal */
   18551                     "%cl", False /* right */);
   18552          break;
   18553 
   18554       /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
   18555       case 0x05: /* SYSCALL */
   18556          guest_RIP_next_mustcheck = True;
   18557          guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   18558          putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
   18559          /* It's important that all guest state is up-to-date
   18560             at this point.  So we declare an end-of-block here, which
   18561             forces any cached guest state to be flushed. */
   18562          jmp_lit(Ijk_Sys_syscall, guest_RIP_next_assumed);
   18563          dres.whatNext = Dis_StopHere;
   18564          DIP("syscall\n");
   18565          break;
   18566 
   18567       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   18568 
   18569       case 0xC0: { /* XADD Gb,Eb */
   18570          Bool decode_OK = False;
   18571          delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   18572          if (!decode_OK)
   18573             goto decode_failure;
   18574          break;
   18575       }
   18576       case 0xC1: { /* XADD Gv,Ev */
   18577          Bool decode_OK = False;
   18578          delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   18579          if (!decode_OK)
   18580             goto decode_failure;
   18581          break;
   18582       }
   18583 
   18584       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   18585 
   18586       case 0x71:
   18587       case 0x72:
   18588       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   18589 
   18590       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   18591       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   18592       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   18593       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   18594 
   18595       case 0xFC:
   18596       case 0xFD:
   18597       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   18598 
   18599       case 0xEC:
   18600       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18601 
   18602       case 0xDC:
   18603       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18604 
   18605       case 0xF8:
   18606       case 0xF9:
   18607       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   18608 
   18609       case 0xE8:
   18610       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18611 
   18612       case 0xD8:
   18613       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   18614 
   18615       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   18616       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   18617 
   18618       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   18619 
   18620       case 0x74:
   18621       case 0x75:
   18622       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   18623 
   18624       case 0x64:
   18625       case 0x65:
   18626       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   18627 
   18628       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   18629       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   18630       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   18631 
   18632       case 0x68:
   18633       case 0x69:
   18634       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   18635 
   18636       case 0x60:
   18637       case 0x61:
   18638       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18639 
   18640       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   18641       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   18642       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   18643       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   18644 
   18645       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18646       case 0xF2:
   18647       case 0xF3:
   18648 
   18649       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   18650       case 0xD2:
   18651       case 0xD3:
   18652 
   18653       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   18654       case 0xE2:
   18655       {
   18656          Long delta0    = delta-1;
   18657          Bool decode_OK = False;
   18658 
   18659          /* If sz==2 this is SSE, and we assume sse idec has
   18660             already spotted those cases by now. */
   18661          if (sz != 4 && sz != 8)
   18662             goto decode_failure;
   18663          if (have66orF2orF3(pfx))
   18664             goto decode_failure;
   18665 
   18666          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, delta-1 );
   18667          if (!decode_OK) {
   18668             delta = delta0;
   18669             goto decode_failure;
   18670          }
   18671          break;
   18672       }
   18673 
   18674       case 0x0E: /* FEMMS */
   18675       case 0x77: /* EMMS */
   18676          if (sz != 4)
   18677             goto decode_failure;
   18678          do_EMMS_preamble();
   18679          DIP("{f}emms\n");
   18680          break;
   18681 
   18682       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   18683       case 0x01: /* 0F 01 /0 -- SGDT */
   18684                  /* 0F 01 /1 -- SIDT */
   18685       {
   18686           /* This is really revolting, but ... since each processor
   18687              (core) only has one IDT and one GDT, just let the guest
   18688              see it (pass-through semantics).  I can't see any way to
   18689              construct a faked-up value, so don't bother to try. */
   18690          modrm = getUChar(delta);
   18691          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18692          delta += alen;
   18693          if (epartIsReg(modrm)) goto decode_failure;
   18694          if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   18695             goto decode_failure;
   18696          switch (gregLO3ofRM(modrm)) {
   18697             case 0: DIP("sgdt %s\n", dis_buf); break;
   18698             case 1: DIP("sidt %s\n", dis_buf); break;
   18699             default: vassert(0); /*NOTREACHED*/
   18700          }
   18701 
   18702          IRDirty* d = unsafeIRDirty_0_N (
   18703                           0/*regparms*/,
   18704                           "amd64g_dirtyhelper_SxDT",
   18705                           &amd64g_dirtyhelper_SxDT,
   18706                           mkIRExprVec_2( mkexpr(addr),
   18707                                          mkU64(gregLO3ofRM(modrm)) )
   18708                       );
   18709          /* declare we're writing memory */
   18710          d->mFx   = Ifx_Write;
   18711          d->mAddr = mkexpr(addr);
   18712          d->mSize = 6;
   18713          stmt( IRStmt_Dirty(d) );
   18714          break;
   18715       }
   18716 
   18717       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   18718 
   18719       default:
   18720          goto decode_failure;
   18721    } /* switch (opc) for the 2-byte opcodes */
   18722    goto decode_success;
   18723    } /* case 0x0F: of primary opcode */
   18724 
   18725    /* ------------------------ ??? ------------------------ */
   18726 
   18727   default:
   18728   decode_failure:
   18729    /* All decode failures end up here. */
   18730    vex_printf("vex amd64->IR: unhandled instruction bytes: "
   18731               "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   18732               (Int)getUChar(delta_start+0),
   18733               (Int)getUChar(delta_start+1),
   18734               (Int)getUChar(delta_start+2),
   18735               (Int)getUChar(delta_start+3),
   18736               (Int)getUChar(delta_start+4),
   18737               (Int)getUChar(delta_start+5),
   18738               (Int)getUChar(delta_start+6),
   18739               (Int)getUChar(delta_start+7) );
   18740 
   18741    /* Tell the dispatcher that this insn cannot be decoded, and so has
   18742       not been executed, and (is currently) the next to be executed.
   18743       RIP should be up-to-date since it made so at the start of each
   18744       insn, but nevertheless be paranoid and update it again right
   18745       now. */
   18746    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   18747    jmp_lit(Ijk_NoDecode, guest_RIP_curr_instr);
   18748    dres.whatNext = Dis_StopHere;
   18749    dres.len      = 0;
   18750    /* We also need to say that a CAS is not expected now, regardless
   18751       of what it might have been set to at the start of the function,
   18752       since the IR that we've emitted just above (to synthesis a
   18753       SIGILL) does not involve any CAS, and presumably no other IR has
   18754       been emitted for this (non-decoded) insn. */
   18755    *expect_CAS = False;
   18756    return dres;
   18757 
   18758    } /* switch (opc) for the main (primary) opcode switch. */
   18759 
   18760   decode_success:
   18761    /* All decode successes end up here. */
   18762    DIP("\n");
   18763    dres.len = (Int)toUInt(delta - delta_start);
   18764    return dres;
   18765 }
   18766 
   18767 #undef DIP
   18768 #undef DIS
   18769 
   18770 
   18771 /*------------------------------------------------------------*/
   18772 /*--- Top-level fn                                         ---*/
   18773 /*------------------------------------------------------------*/
   18774 
   18775 /* Disassemble a single instruction into IR.  The instruction
   18776    is located in host memory at &guest_code[delta]. */
   18777 
   18778 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   18779                            Bool         put_IP,
   18780                            Bool         (*resteerOkFn) ( void*, Addr64 ),
   18781                            Bool         resteerCisOk,
   18782                            void*        callback_opaque,
   18783                            UChar*       guest_code_IN,
   18784                            Long         delta,
   18785                            Addr64       guest_IP,
   18786                            VexArch      guest_arch,
   18787                            VexArchInfo* archinfo,
   18788                            VexAbiInfo*  abiinfo,
   18789                            Bool         host_bigendian_IN )
   18790 {
   18791    Int       i, x1, x2;
   18792    Bool      expect_CAS, has_CAS;
   18793    DisResult dres;
   18794 
   18795    /* Set globals (see top of this file) */
   18796    vassert(guest_arch == VexArchAMD64);
   18797    guest_code           = guest_code_IN;
   18798    irsb                 = irsb_IN;
   18799    host_is_bigendian    = host_bigendian_IN;
   18800    guest_RIP_curr_instr = guest_IP;
   18801    guest_RIP_bbstart    = guest_IP - delta;
   18802 
   18803    /* We'll consult these after doing disInstr_AMD64_WRK. */
   18804    guest_RIP_next_assumed   = 0;
   18805    guest_RIP_next_mustcheck = False;
   18806 
   18807    x1 = irsb_IN->stmts_used;
   18808    expect_CAS = False;
   18809    dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
   18810                                resteerCisOk,
   18811                                callback_opaque,
   18812                                delta, archinfo, abiinfo );
   18813    x2 = irsb_IN->stmts_used;
   18814    vassert(x2 >= x1);
   18815 
   18816    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   18817       got it right.  Failure of this assertion is serious and denotes
   18818       a bug in disInstr. */
   18819    if (guest_RIP_next_mustcheck
   18820        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   18821       vex_printf("\n");
   18822       vex_printf("assumed next %%rip = 0x%llx\n",
   18823                  guest_RIP_next_assumed );
   18824       vex_printf(" actual next %%rip = 0x%llx\n",
   18825                  guest_RIP_curr_instr + dres.len );
   18826       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   18827    }
   18828 
   18829    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   18830       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   18831       IRCAS as directed by the returned expect_CAS value. */
   18832    has_CAS = False;
   18833    for (i = x1; i < x2; i++) {
   18834       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   18835          has_CAS = True;
   18836    }
   18837 
   18838    if (expect_CAS != has_CAS) {
   18839       /* inconsistency detected.  re-disassemble the instruction so as
   18840          to generate a useful error message; then assert. */
   18841       vex_traceflags |= VEX_TRACE_FE;
   18842       dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
   18843                                   resteerCisOk,
   18844                                   callback_opaque,
   18845                                   delta, archinfo, abiinfo );
   18846       for (i = x1; i < x2; i++) {
   18847          vex_printf("\t\t");
   18848          ppIRStmt(irsb_IN->stmts[i]);
   18849          vex_printf("\n");
   18850       }
   18851       /* Failure of this assertion is serious and denotes a bug in
   18852          disInstr. */
   18853       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   18854    }
   18855 
   18856    return dres;
   18857 }
   18858 
   18859 
   18860 /*------------------------------------------------------------*/
   18861 /*--- Unused stuff                                         ---*/
   18862 /*------------------------------------------------------------*/
   18863 
   18864 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   18865 // this should ever be needed.
   18866 //
   18867 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   18868 //{
   18869 //   /* Scheme is simple: propagate the most significant 1-bit into all
   18870 //      lower positions in the word.  This gives a word of the form
   18871 //      0---01---1.  Now invert it, giving a word of the form
   18872 //      1---10---0, then do a population-count idiom (to count the 1s,
   18873 //      which is the number of leading zeroes, or the word size if the
//      original word was 0).
   18875 //   */
   18876 //   Int i;
   18877 //   IRTemp t[7];
   18878 //   for (i = 0; i < 7; i++) {
   18879 //      t[i] = newTemp(ty);
   18880 //   }
   18881 //   if (ty == Ity_I64) {
   18882 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   18883 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   18884 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   18885 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   18886 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   18887 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   18888 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   18889 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   18890 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   18891 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   18892 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   18893 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   18894 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   18895 //      return gen_POPCOUNT(ty, t[6]);
   18896 //   }
   18897 //   if (ty == Ity_I32) {
   18898 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   18899 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   18900 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   18901 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   18902 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   18903 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   18904 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   18905 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   18906 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   18907 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   18908 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   18909 //      return gen_POPCOUNT(ty, t[5]);
   18910 //   }
   18911 //   if (ty == Ity_I16) {
   18912 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   18913 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   18914 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   18915 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   18916 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   18917 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   18918 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   18919 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   18920 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   18921 //      return gen_POPCOUNT(ty, t[4]);
   18922 //   }
   18923 //   vassert(0);
   18924 //}
   18925 
   18926 
   18927 /*--------------------------------------------------------------------*/
   18928 /*--- end                                       guest_amd64_toIR.c ---*/
   18929 /*--------------------------------------------------------------------*/
   18930