      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2017 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * some of the FCOM cases could do with testing -- not convinced
     55      that the args are the right way round.
     56 
     57    * FSAVE does not re-initialise the FPU; it should do
     58 
     59    * FINIT not only initialises the FPU environment, it also zeroes
     60      all the FP registers.  It should leave the registers unchanged.
     61 
     62     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     63     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     64     only way to observe eflags[1], a proper fix would be to make that
     65     bit be set by PUSHF.
     66 
     67     This module uses global variables and so is not MT-safe (if that
     68     should ever become relevant).
     69 */
     70 
     71 /* Notes re address size overrides (0x67).
     72 
     73    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     74    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     75    and System Instructions"), Section 1.2.3 ("Address-Size Override
     76    Prefix"):
     77 
     78    0x67 applies to all explicit memory references, causing the top
     79    32 bits of the effective address to become zero.
     80 
     81    0x67 has no effect on stack references (push/pop); these always
     82    use a 64-bit address.
     83 
     84    0x67 changes the interpretation of instructions which implicitly
     85    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     86    instead.  These are:
     87 
     88       cmp{s,sb,sw,sd,sq}
     89       in{s,sb,sw,sd}
     90       jcxz, jecxz, jrcxz
     91       lod{s,sb,sw,sd,sq}
      92       loop{,e,ne,nz,z}
     93       mov{s,sb,sw,sd,sq}
     94       out{s,sb,sw,sd}
     95       rep{,e,ne,nz}
     96       sca{s,sb,sw,sd,sq}
     97       sto{s,sb,sw,sd,sq}
     98       xlat{,b} */
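
         /* Illustrative example (a sketch, not taken from the manuals):
            "48 8B 18" decodes as movq (%rax),%rbx.  Prefixed with 0x67,
            "67 48 8B 18", the effective address is truncated to 32 bits,
            so the load is from the zero-extended value of %eax. */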
     99 
    100 /* "Special" instructions.
    101 
     102    This instruction decoder can decode four special instructions
    103    which mean nothing natively (are no-ops as far as regs/mem are
    104    concerned) but have meaning for supporting Valgrind.  A special
    105    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    106    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    107    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
     108    Following that, one of the following 4 is allowed (standard
    109    interpretation in parentheses):
    110 
    111       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    112       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    113       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    114       4887F6 (xchgq %rdi,%rdi)   IR injection
    115 
    116    Any other bytes following the 16-byte preamble are illegal and
    117    constitute a failure in instruction decoding.  This all assumes
    118    that the preamble will never occur except in specific code
    119    fragments designed for Valgrind to catch.
    120 
    121    No prefixes may precede a "Special" instruction.
    122 */
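
         /* For reference, client programs typically emit the preamble via
            inline assembly, roughly as valgrind.h does for amd64 (a sketch):

               #define SPECIAL_PREAMBLE                        \
                  "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"      \
                  "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"

            The four rotate amounts sum to 3+13+61+51 == 128, a multiple of
            64, so %rdi is left unchanged -- which is why the preamble is a
            no-op in the standard interpretation. */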
    123 
    124 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    125    insns: the side-exit back to the start of the insn is done with
    126    Ijk_Boring.  This is quite wrong, it should be done with
    127    Ijk_NoRedir, since otherwise the side exit, which is intended to
    128    restart the instruction for whatever reason, could go somewhere
    129    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    130    no-redir jumps performance critical, at least for rep-prefixed
    131    instructions, since all iterations thereof would involve such a
    132    jump.  It's not such a big deal with casLE since the side exit is
    133    only taken if the CAS fails, that is, the location is contended,
    134    which is relatively unlikely.
    135 
    136    Note also, the test for CAS success vs failure is done using
    137    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    138    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    139    shouldn't definedness-check these comparisons.  See
    140    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    141    background/rationale.
    142 */
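
         /* For concreteness, the casLE side exit has roughly this shape (a
            sketch, not the exact generated code; 'expd' and 'old' stand for
            the expected and observed values):

               stmt( IRStmt_Exit(
                        binop(Iop_CasCmpNE64, mkexpr(old), mkexpr(expd)),
                        Ijk_Boring,
                        IRConst_U64(guest_RIP_curr_instr),
                        OFFB_RIP ) );

            that is, if the CAS observed a value other than the expected
            one, jump back to the start of the insn and retry it. */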
    143 
    144 /* LOCK prefixed instructions.  These are translated using IR-level
    145    CAS statements (IRCAS) and are believed to preserve atomicity, even
    146    from the point of view of some other process racing against a
    147    simulated one (presumably they communicate via a shared memory
    148    segment).
    149 
    150    Handlers which are aware of LOCK prefixes are:
    151       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    152       dis_cmpxchg_G_E  (cmpxchg)
    153       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    154       dis_Grp3         (not, neg)
    155       dis_Grp4         (inc, dec)
    156       dis_Grp5         (inc, dec)
    157       dis_Grp8_Imm     (bts, btc, btr)
    158       dis_bt_G_E       (bts, btc, btr)
    159       dis_xadd_G_E     (xadd)
    160 */
    161 
    162 
    163 #include "libvex_basictypes.h"
    164 #include "libvex_ir.h"
    165 #include "libvex.h"
    166 #include "libvex_guest_amd64.h"
    167 
    168 #include "main_util.h"
    169 #include "main_globals.h"
    170 #include "guest_generic_bb_to_IR.h"
    171 #include "guest_generic_x87.h"
    172 #include "guest_amd64_defs.h"
    173 
    174 
    175 /*------------------------------------------------------------*/
    176 /*--- Globals                                              ---*/
    177 /*------------------------------------------------------------*/
    178 
    179 /* These are set at the start of the translation of an insn, right
    180    down in disInstr_AMD64, so that we don't have to pass them around
    181    endlessly.  They are all constant during the translation of any
    182    given insn. */
    183 
    184 /* These are set at the start of the translation of a BB, so
    185    that we don't have to pass them around endlessly. */
    186 
    187 /* We need to know this to do sub-register accesses correctly. */
    188 static VexEndness host_endness;
    189 
    190 /* Pointer to the guest code area (points to start of BB, not to the
    191    insn being processed). */
    192 static const UChar* guest_code;
    193 
    194 /* The guest address corresponding to guest_code[0]. */
    195 static Addr64 guest_RIP_bbstart;
    196 
    197 /* The guest address for the instruction currently being
    198    translated. */
    199 static Addr64 guest_RIP_curr_instr;
    200 
    201 /* The IRSB* into which we're generating code. */
    202 static IRSB* irsb;
    203 
    204 /* For ensuring that %rip-relative addressing is done right.  A read
    205    of %rip generates the address of the next instruction.  It may be
    206    that we don't conveniently know that inside disAMode().  For sanity
    207    checking, if the next insn %rip is needed, we make a guess at what
    208    it is, record that guess here, and set the accompanying Bool to
    209    indicate that -- after this insn's decode is finished -- that guess
    210    needs to be checked.  */
    211 
     212 /* At the start of each insn decode, this pair is set to (0, False).
     213    After the decode, if _mustcheck is now True, _assumed is
     214    checked. */
    215 
    216 static Addr64 guest_RIP_next_assumed;
    217 static Bool   guest_RIP_next_mustcheck;
    218 
    219 
    220 /*------------------------------------------------------------*/
    221 /*--- Helpers for constructing IR.                         ---*/
    222 /*------------------------------------------------------------*/
    223 
    224 /* Generate a new temporary of the given type. */
    225 static IRTemp newTemp ( IRType ty )
    226 {
    227    vassert(isPlausibleIRType(ty));
    228    return newIRTemp( irsb->tyenv, ty );
    229 }
    230 
    231 /* Add a statement to the list held by "irsb". */
    232 static void stmt ( IRStmt* st )
    233 {
    234    addStmtToIRSB( irsb, st );
    235 }
    236 
    237 /* Generate a statement "dst := e". */
    238 static void assign ( IRTemp dst, IRExpr* e )
    239 {
    240    stmt( IRStmt_WrTmp(dst, e) );
    241 }
    242 
    243 static IRExpr* unop ( IROp op, IRExpr* a )
    244 {
    245    return IRExpr_Unop(op, a);
    246 }
    247 
    248 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    249 {
    250    return IRExpr_Binop(op, a1, a2);
    251 }
    252 
    253 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    254 {
    255    return IRExpr_Triop(op, a1, a2, a3);
    256 }
    257 
    258 static IRExpr* mkexpr ( IRTemp tmp )
    259 {
    260    return IRExpr_RdTmp(tmp);
    261 }
    262 
    263 static IRExpr* mkU8 ( ULong i )
    264 {
    265    vassert(i < 256);
    266    return IRExpr_Const(IRConst_U8( (UChar)i ));
    267 }
    268 
    269 static IRExpr* mkU16 ( ULong i )
    270 {
    271    vassert(i < 0x10000ULL);
    272    return IRExpr_Const(IRConst_U16( (UShort)i ));
    273 }
    274 
    275 static IRExpr* mkU32 ( ULong i )
    276 {
    277    vassert(i < 0x100000000ULL);
    278    return IRExpr_Const(IRConst_U32( (UInt)i ));
    279 }
    280 
    281 static IRExpr* mkU64 ( ULong i )
    282 {
    283    return IRExpr_Const(IRConst_U64(i));
    284 }
    285 
    286 static IRExpr* mkU ( IRType ty, ULong i )
    287 {
    288    switch (ty) {
    289       case Ity_I8:  return mkU8(i);
    290       case Ity_I16: return mkU16(i);
    291       case Ity_I32: return mkU32(i);
    292       case Ity_I64: return mkU64(i);
    293       default: vpanic("mkU(amd64)");
    294    }
    295 }
    296 
    297 static void storeLE ( IRExpr* addr, IRExpr* data )
    298 {
    299    stmt( IRStmt_Store(Iend_LE, addr, data) );
    300 }
    301 
    302 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    303 {
    304    return IRExpr_Load(Iend_LE, ty, addr);
    305 }
    306 
    307 static IROp mkSizedOp ( IRType ty, IROp op8 )
    308 {
    309    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    310            || op8 == Iop_Mul8
    311            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    312            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    313            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    314            || op8 == Iop_CasCmpNE8
    315            || op8 == Iop_Not8 );
    316    switch (ty) {
    317       case Ity_I8:  return 0 +op8;
    318       case Ity_I16: return 1 +op8;
    319       case Ity_I32: return 2 +op8;
    320       case Ity_I64: return 3 +op8;
    321       default: vpanic("mkSizedOp(amd64)");
    322    }
    323 }
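
         /* mkSizedOp relies on the IROp numbering in libvex_ir.h, where each
            sized family is declared in the order 8, 16, 32, 64, so the wider
            variants sit at fixed offsets from the 8-bit one.  For example:

               mkSizedOp(Ity_I32, Iop_Add8) == Iop_Add32
               mkSizedOp(Ity_I64, Iop_Xor8) == Iop_Xor64
         */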
    324 
    325 static
    326 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    327 {
    328    if (szSmall == 1 && szBig == 4) {
    329       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    330    }
    331    if (szSmall == 1 && szBig == 2) {
    332       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    333    }
    334    if (szSmall == 2 && szBig == 4) {
    335       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    336    }
    337    if (szSmall == 1 && szBig == 8 && !signd) {
    338       return unop(Iop_8Uto64, src);
    339    }
    340    if (szSmall == 1 && szBig == 8 && signd) {
    341       return unop(Iop_8Sto64, src);
    342    }
    343    if (szSmall == 2 && szBig == 8 && !signd) {
    344       return unop(Iop_16Uto64, src);
    345    }
    346    if (szSmall == 2 && szBig == 8 && signd) {
    347       return unop(Iop_16Sto64, src);
    348    }
    349    vpanic("doScalarWidening(amd64)");
    350 }
    351 
    352 static
    353 void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
    354 {
    355    IRType ty = typeOfIRExpr(irsb->tyenv, value);
    356    stmt( IRStmt_Put(gstOffB,
    357                     IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
    358 }
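
         /* Example use of putGuarded -- conditionally clearing %rax (a
            sketch; 'guard' is any Ity_I1 expression):

               putGuarded( OFFB_RAX, guard, mkU64(0) );

            This expands to RAX := ITE(guard, 0, RAX): the Put itself is
            unconditional, but the old value is written back when the
            guard is false. */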
    359 
    360 
    361 /*------------------------------------------------------------*/
    362 /*--- Debugging output                                     ---*/
    363 /*------------------------------------------------------------*/
    364 
    365 /* Bomb out if we can't handle something. */
    366 __attribute__ ((noreturn))
    367 static void unimplemented ( const HChar* str )
    368 {
    369    vex_printf("amd64toIR: unimplemented feature\n");
    370    vpanic(str);
    371 }
    372 
    373 #define DIP(format, args...)           \
    374    if (vex_traceflags & VEX_TRACE_FE)  \
    375       vex_printf(format, ## args)
    376 
    377 #define DIS(buf, format, args...)      \
    378    if (vex_traceflags & VEX_TRACE_FE)  \
    379       vex_sprintf(buf, format, ## args)
    380 
    381 
    382 /*------------------------------------------------------------*/
    383 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    384 /*------------------------------------------------------------*/
    385 
    386 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    387 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    388 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    389 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    390 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    391 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    392 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    393 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    394 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    395 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    396 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    397 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    398 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    399 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    400 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    401 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    402 
    403 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    404 
    405 #define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
    406 #define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)
    407 
    408 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    409 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    410 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    411 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    412 
    413 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    414 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    415 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    416 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    417 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    418 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    419 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    420 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    421 
    422 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    423 #define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
    424 #define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
    425 #define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
    426 #define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
    427 #define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
    428 #define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
    429 #define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
    430 #define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
    431 #define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
    432 #define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
    433 #define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
    434 #define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
    435 #define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
    436 #define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
    437 #define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
    438 #define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
    439 #define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)
    440 
    441 #define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
    442 #define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
    443 #define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)
    444 
    445 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    446 
    447 
    448 /*------------------------------------------------------------*/
    449 /*--- Helper bits and pieces for deconstructing the        ---*/
    450 /*--- amd64 insn stream.                                   ---*/
    451 /*------------------------------------------------------------*/
    452 
    453 /* This is the AMD64 register encoding -- integer regs. */
    454 #define R_RAX 0
    455 #define R_RCX 1
    456 #define R_RDX 2
    457 #define R_RBX 3
    458 #define R_RSP 4
    459 #define R_RBP 5
    460 #define R_RSI 6
    461 #define R_RDI 7
    462 #define R_R8  8
    463 #define R_R9  9
    464 #define R_R10 10
    465 #define R_R11 11
    466 #define R_R12 12
    467 #define R_R13 13
    468 #define R_R14 14
    469 #define R_R15 15
    470 
    471 /* This is the Intel register encoding -- segment regs. */
    472 #define R_ES 0
    473 #define R_CS 1
    474 #define R_SS 2
    475 #define R_DS 3
    476 #define R_FS 4
    477 #define R_GS 5
    478 
    479 
    480 /* Various simple conversions */
    481 
    482 static ULong extend_s_8to64 ( UChar x )
    483 {
    484    return (ULong)((Long)(((ULong)x) << 56) >> 56);
    485 }
    486 
    487 static ULong extend_s_16to64 ( UShort x )
    488 {
    489    return (ULong)((Long)(((ULong)x) << 48) >> 48);
    490 }
    491 
    492 static ULong extend_s_32to64 ( UInt x )
    493 {
    494    return (ULong)((Long)(((ULong)x) << 32) >> 32);
    495 }
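
         /* Worked examples:
               extend_s_8to64 (0x7F)       == 0x000000000000007FULL
               extend_s_8to64 (0x80)       == 0xFFFFFFFFFFFFFF80ULL
               extend_s_32to64(0xDEADBEEF) == 0xFFFFFFFFDEADBEEFULL
            The shift-up/arithmetic-shift-down pairing replicates the sign
            bit into the upper bits, assuming (as VEX does throughout) that
            the host's signed right shift is arithmetic. */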
    496 
     497 /* Figure out whether the mod and rm parts of a modRM byte refer to a
     498    register or memory.  If they refer to a register, the byte will
     499    have the form 11XXXYYY, where YYY is the register number. */
    500 inline
    501 static Bool epartIsReg ( UChar mod_reg_rm )
    502 {
    503    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    504 }
    505 
    506 /* Extract the 'g' field from a modRM byte.  This only produces 3
    507    bits, which is not a complete register number.  You should avoid
    508    this function if at all possible. */
    509 inline
    510 static Int gregLO3ofRM ( UChar mod_reg_rm )
    511 {
    512    return (Int)( (mod_reg_rm >> 3) & 7 );
    513 }
    514 
    515 /* Ditto the 'e' field of a modRM byte. */
    516 inline
    517 static Int eregLO3ofRM ( UChar mod_reg_rm )
    518 {
    519    return (Int)(mod_reg_rm & 0x7);
    520 }
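
         /* Worked example: for the modRM byte 0xD9 == 11 011 001b,
            epartIsReg(0xD9) is True, gregLO3ofRM(0xD9) == 3 and
            eregLO3ofRM(0xD9) == 1; with no REX extension bits those
            select %rbx and %rcx respectively. */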
    521 
     522 /* Get an 8/16/32-bit unsigned value out of the insn stream. */
    523 
    524 static inline UChar getUChar ( Long delta )
    525 {
    526    UChar v = guest_code[delta+0];
    527    return v;
    528 }
    529 
    530 static UInt getUDisp16 ( Long delta )
    531 {
    532    UInt v = guest_code[delta+1]; v <<= 8;
    533    v |= guest_code[delta+0];
    534    return v & 0xFFFF;
    535 }
    536 
    537 //.. static UInt getUDisp ( Int size, Long delta )
    538 //.. {
    539 //..    switch (size) {
    540 //..       case 4: return getUDisp32(delta);
    541 //..       case 2: return getUDisp16(delta);
    542 //..       case 1: return getUChar(delta);
    543 //..       default: vpanic("getUDisp(x86)");
    544 //..    }
    545 //..    return 0; /*notreached*/
    546 //.. }
    547 
    548 
    549 /* Get a byte value out of the insn stream and sign-extend to 64
    550    bits. */
    551 static Long getSDisp8 ( Long delta )
    552 {
    553    return extend_s_8to64( guest_code[delta] );
    554 }
    555 
    556 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    557    bits. */
    558 static Long getSDisp16 ( Long delta )
    559 {
    560    UInt v = guest_code[delta+1]; v <<= 8;
    561    v |= guest_code[delta+0];
    562    return extend_s_16to64( (UShort)v );
    563 }
    564 
    565 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    566    bits. */
    567 static Long getSDisp32 ( Long delta )
    568 {
    569    UInt v = guest_code[delta+3]; v <<= 8;
    570    v |= guest_code[delta+2]; v <<= 8;
    571    v |= guest_code[delta+1]; v <<= 8;
    572    v |= guest_code[delta+0];
    573    return extend_s_32to64( v );
    574 }
    575 
    576 /* Get a 64-bit value out of the insn stream. */
    577 static Long getDisp64 ( Long delta )
    578 {
    579    ULong v = 0;
    580    v |= guest_code[delta+7]; v <<= 8;
    581    v |= guest_code[delta+6]; v <<= 8;
    582    v |= guest_code[delta+5]; v <<= 8;
    583    v |= guest_code[delta+4]; v <<= 8;
    584    v |= guest_code[delta+3]; v <<= 8;
    585    v |= guest_code[delta+2]; v <<= 8;
    586    v |= guest_code[delta+1]; v <<= 8;
    587    v |= guest_code[delta+0];
    588    return v;
    589 }
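
         /* All of the above read little-endian: for example, the four
            insn-stream bytes 78 56 34 12 are assembled by getSDisp32
            into 0x12345678. */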
    590 
     591 /* Note: AMD64 allows 64-bit immediates only for MOV reg64,imm64 (which
     592    is decoded separately), so calling this with size==8 is an error. */
    593 static Long getSDisp ( Int size, Long delta )
    594 {
    595    switch (size) {
    596       case 4: return getSDisp32(delta);
    597       case 2: return getSDisp16(delta);
    598       case 1: return getSDisp8(delta);
    599       default: vpanic("getSDisp(amd64)");
    600   }
    601 }
    602 
    603 static ULong mkSizeMask ( Int sz )
    604 {
    605    switch (sz) {
    606       case 1: return 0x00000000000000FFULL;
    607       case 2: return 0x000000000000FFFFULL;
    608       case 4: return 0x00000000FFFFFFFFULL;
    609       case 8: return 0xFFFFFFFFFFFFFFFFULL;
     610       default: vpanic("mkSizeMask(amd64)");
    611    }
    612 }
    613 
    614 static Int imin ( Int a, Int b )
    615 {
    616    return (a < b) ? a : b;
    617 }
    618 
    619 static IRType szToITy ( Int n )
    620 {
    621    switch (n) {
    622       case 1: return Ity_I8;
    623       case 2: return Ity_I16;
    624       case 4: return Ity_I32;
    625       case 8: return Ity_I64;
    626       default: vex_printf("\nszToITy(%d)\n", n);
    627                vpanic("szToITy(amd64)");
    628    }
    629 }
    630 
    631 
    632 /*------------------------------------------------------------*/
    633 /*--- For dealing with prefixes.                           ---*/
    634 /*------------------------------------------------------------*/
    635 
    636 /* The idea is to pass around an int holding a bitmask summarising
    637    info from the prefixes seen on the current instruction, including
    638    info from the REX byte.  This info is used in various places, but
    639    most especially when making sense of register fields in
    640    instructions.
    641 
    642    The top 8 bits of the prefix are 0x55, just as a hacky way to
    643    ensure it really is a valid prefix.
    644 
    645    Things you can safely assume about a well-formed prefix:
    646    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    647    * if REX is not present then REXW,REXR,REXX,REXB will read
    648      as zero.
    649    * F2 and F3 will not both be 1.
    650 */
    651 
    652 typedef UInt  Prefix;
    653 
    654 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    655 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    656 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    657 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    658 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    659 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    660 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    661 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
     662 #define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
     663 #define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
    664 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    665 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    666 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    667 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    668 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    669 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    670 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    671 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    672 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    673    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    674    positions. */
    675 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    676 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    677 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    678 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    679 
    680 
    681 #define PFX_EMPTY 0x55000000
    682 
    683 static Bool IS_VALID_PFX ( Prefix pfx ) {
    684    return toBool((pfx & 0xFF000000) == PFX_EMPTY);
    685 }
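
         /* Worked example: the prefix bytes 66 41 (operand-size override,
            then REX with only B set) would be summarised as

               PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXB  ==  0x55000046

            which IS_VALID_PFX accepts, since the top byte is 0x55. */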
    686 
    687 static Bool haveREX ( Prefix pfx ) {
    688    return toBool(pfx & PFX_REX);
    689 }
    690 
    691 static Int getRexW ( Prefix pfx ) {
    692    return (pfx & PFX_REXW) ? 1 : 0;
    693 }
    694 static Int getRexR ( Prefix pfx ) {
    695    return (pfx & PFX_REXR) ? 1 : 0;
    696 }
    697 static Int getRexX ( Prefix pfx ) {
    698    return (pfx & PFX_REXX) ? 1 : 0;
    699 }
    700 static Int getRexB ( Prefix pfx ) {
    701    return (pfx & PFX_REXB) ? 1 : 0;
    702 }
    703 
    704 /* Check a prefix doesn't have F2 or F3 set in it, since usually that
    705    completely changes what instruction it really is. */
    706 static Bool haveF2orF3 ( Prefix pfx ) {
    707    return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
    708 }
    709 static Bool haveF2andF3 ( Prefix pfx ) {
    710    return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
    711 }
    712 static Bool haveF2 ( Prefix pfx ) {
    713    return toBool((pfx & PFX_F2) > 0);
    714 }
    715 static Bool haveF3 ( Prefix pfx ) {
    716    return toBool((pfx & PFX_F3) > 0);
    717 }
    718 
    719 static Bool have66 ( Prefix pfx ) {
    720    return toBool((pfx & PFX_66) > 0);
    721 }
    722 static Bool haveASO ( Prefix pfx ) {
    723    return toBool((pfx & PFX_ASO) > 0);
    724 }
    725 static Bool haveLOCK ( Prefix pfx ) {
    726    return toBool((pfx & PFX_LOCK) > 0);
    727 }
    728 
    729 /* Return True iff pfx has 66 set and F2 and F3 clear */
    730 static Bool have66noF2noF3 ( Prefix pfx )
    731 {
    732   return
    733      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    734 }
    735 
    736 /* Return True iff pfx has F2 set and 66 and F3 clear */
    737 static Bool haveF2no66noF3 ( Prefix pfx )
    738 {
    739   return
    740      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    741 }
    742 
    743 /* Return True iff pfx has F3 set and 66 and F2 clear */
    744 static Bool haveF3no66noF2 ( Prefix pfx )
    745 {
    746   return
    747      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    748 }
    749 
    750 /* Return True iff pfx has F3 set and F2 clear */
    751 static Bool haveF3noF2 ( Prefix pfx )
    752 {
    753   return
    754      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    755 }
    756 
    757 /* Return True iff pfx has F2 set and F3 clear */
    758 static Bool haveF2noF3 ( Prefix pfx )
    759 {
    760   return
    761      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    762 }
    763 
    764 /* Return True iff pfx has 66, F2 and F3 clear */
    765 static Bool haveNo66noF2noF3 ( Prefix pfx )
    766 {
    767   return
    768      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    769 }
    770 
    771 /* Return True iff pfx has any of 66, F2 and F3 set */
    772 static Bool have66orF2orF3 ( Prefix pfx )
    773 {
    774   return toBool( ! haveNo66noF2noF3(pfx) );
    775 }
    776 
    777 /* Return True iff pfx has 66 or F3 set */
    778 static Bool have66orF3 ( Prefix pfx )
    779 {
    780    return toBool((pfx & (PFX_66|PFX_F3)) > 0);
    781 }
    782 
    783 /* Clear all the segment-override bits in a prefix. */
    784 static Prefix clearSegBits ( Prefix p )
    785 {
    786    return
    787       p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
    788 }
    789 
    790 /* Get the (inverted, hence back to "normal") VEX.vvvv field. */
    791 static UInt getVexNvvvv ( Prefix pfx ) {
    792    UInt r = (UInt)pfx;
    793    r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
    794    return r & 0xF;
    795 }
    796 
    797 static Bool haveVEX ( Prefix pfx ) {
    798    return toBool(pfx & PFX_VEX);
    799 }
    800 
    801 static Int getVexL ( Prefix pfx ) {
    802    return (pfx & PFX_VEXL) ? 1 : 0;
    803 }
    804 
    805 
    806 /*------------------------------------------------------------*/
    807 /*--- For dealing with escapes                             ---*/
    808 /*------------------------------------------------------------*/
    809 
    810 
    811 /* Escapes come after the prefixes, but before the primary opcode
    812    byte.  They escape the primary opcode byte into a bigger space.
    813    The 0xF0000000 isn't significant, except so as to make it not
    814    overlap valid Prefix values, for sanity checking.
    815 */
    816 
    817 typedef
    818    enum {
    819       ESC_NONE=0xF0000000, // none
    820       ESC_0F,              // 0F
    821       ESC_0F38,            // 0F 38
    822       ESC_0F3A             // 0F 3A
    823    }
    824    Escape;
    825 
    826 
    827 /*------------------------------------------------------------*/
    828 /*--- For dealing with integer registers                   ---*/
    829 /*------------------------------------------------------------*/
    830 
    831 /* This is somewhat complex.  The rules are:
    832 
    833    For 64, 32 and 16 bit register references, the e or g fields in the
    834    modrm bytes supply the low 3 bits of the register number.  The
    835    fourth (most-significant) bit of the register number is supplied by
    836    the REX byte, if it is present; else that bit is taken to be zero.
    837 
    838    The REX.R bit supplies the high bit corresponding to the g register
    839    field, and the REX.B bit supplies the high bit corresponding to the
    840    e register field (when the mod part of modrm indicates that modrm's
    841    e component refers to a register and not to memory).
    842 
     843    The REX.X bit supplies a high register bit for certain registers
     844    in SIB address modes, and is rarely used.
    845 
    846    For 8 bit register references, the presence of the REX byte itself
    847    has significance.  If there is no REX present, then the 3-bit
    848    number extracted from the modrm e or g field is treated as an index
    849    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    850    old x86 encoding scheme.
    851 
     852    But if there is a REX present, the register reference is
     853    interpreted in the same way as for 64/32/16-bit references: a high
     854    bit is extracted from REX, giving a 4-bit number, and the denoted
     855    register is the lowest 8 bits of the integer register selected by
     856    that number.  In particular, values 4 through 7 of this sequence
     857    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
     858    %rsp %rbp %rsi %rdi.
    859 
    860    The REX.W bit has no bearing at all on register numbers.  Instead
    861    its presence indicates that the operand size is to be overridden
    862    from its default value (32 bits) to 64 bits instead.  This is in
    863    the same fashion that an 0x66 prefix indicates the operand size is
    864    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    865    0x66 are present there is a conflict, and REX.W takes precedence.
    866 
    867    Rather than try to handle this complexity using a single huge
    868    function, several smaller ones are provided.  The aim is to make it
    869    as difficult as possible to screw up register decoding in a subtle
    870    and hard-to-track-down way.
    871 
    872    Because these routines fish around in the host's memory (that is,
    873    in the guest state area) for sub-parts of guest registers, their
    874    correctness depends on the host's endianness.  So far these
    875    routines only work for little-endian hosts.  Those for which
    876    endianness is important have assertions to ensure sanity.
    877 */
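
         /* Worked example of the 8-bit irregularity: a reg-field value of 4
            names %ah when no REX byte is present, but %spl when any REX
            byte is present -- even 0x40, which has all of W/R/X/B clear. */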
    878 
    879 
    880 /* About the simplest question you can ask: where do the 64-bit
    881    integer registers live (in the guest state) ? */
    882 
    883 static Int integerGuestReg64Offset ( UInt reg )
    884 {
    885    switch (reg) {
    886       case R_RAX: return OFFB_RAX;
    887       case R_RCX: return OFFB_RCX;
    888       case R_RDX: return OFFB_RDX;
    889       case R_RBX: return OFFB_RBX;
    890       case R_RSP: return OFFB_RSP;
    891       case R_RBP: return OFFB_RBP;
    892       case R_RSI: return OFFB_RSI;
    893       case R_RDI: return OFFB_RDI;
    894       case R_R8:  return OFFB_R8;
    895       case R_R9:  return OFFB_R9;
    896       case R_R10: return OFFB_R10;
    897       case R_R11: return OFFB_R11;
    898       case R_R12: return OFFB_R12;
    899       case R_R13: return OFFB_R13;
    900       case R_R14: return OFFB_R14;
    901       case R_R15: return OFFB_R15;
    902       default: vpanic("integerGuestReg64Offset(amd64)");
    903    }
    904 }
    905 
    906 
    907 /* Produce the name of an integer register, for printing purposes.
    908    reg is a number in the range 0 .. 15 that has been generated from a
    909    3-bit reg-field number and a REX extension bit.  irregular denotes
    910    the case where sz==1 and no REX byte is present. */
    911 
    912 static
    913 const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
    914 {
    915    static const HChar* ireg64_names[16]
    916      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    917          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    918    static const HChar* ireg32_names[16]
    919      = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
    920          "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
    921    static const HChar* ireg16_names[16]
    922      = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
    923          "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
    924    static const HChar* ireg8_names[16]
    925      = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
    926          "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
    927    static const HChar* ireg8_irregular[8]
    928      = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };
    929 
    930    vassert(reg < 16);
    931    if (sz == 1) {
    932       if (irregular)
    933          vassert(reg < 8);
    934    } else {
    935       vassert(irregular == False);
    936    }
    937 
    938    switch (sz) {
    939       case 8: return ireg64_names[reg];
    940       case 4: return ireg32_names[reg];
    941       case 2: return ireg16_names[reg];
    942       case 1: if (irregular) {
    943                  return ireg8_irregular[reg];
    944               } else {
    945                  return ireg8_names[reg];
    946               }
    947       default: vpanic("nameIReg(amd64)");
    948    }
    949 }
    950 
    951 /* Using the same argument conventions as nameIReg, produce the
    952    guest state offset of an integer register. */
    953 
    954 static
    955 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
    956 {
    957    vassert(reg < 16);
    958    if (sz == 1) {
    959       if (irregular)
    960          vassert(reg < 8);
    961    } else {
    962       vassert(irregular == False);
    963    }
    964 
    965    /* Deal with irregular case -- sz==1 and no REX present */
    966    if (sz == 1 && irregular) {
    967       switch (reg) {
    968          case R_RSP: return 1+ OFFB_RAX;
    969          case R_RBP: return 1+ OFFB_RCX;
    970          case R_RSI: return 1+ OFFB_RDX;
    971          case R_RDI: return 1+ OFFB_RBX;
    972          default:    break; /* use the normal case */
    973       }
    974    }
    975 
    976    /* Normal case */
    977    return integerGuestReg64Offset(reg);
    978 }
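
         /* So, for example, offsetIReg(1, R_RBP, True) is 1 + OFFB_RCX --
            byte 1 of %rcx, which is %ch -- whereas offsetIReg(1, R_RBP,
            False) is OFFB_RBP, the low byte %bpl. */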
    979 
    980 
    981 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    982 
    983 static IRExpr* getIRegCL ( void )
    984 {
    985    vassert(host_endness == VexEndnessLE);
    986    return IRExpr_Get( OFFB_RCX, Ity_I8 );
    987 }
    988 
    989 
    990 /* Write to the %AH register. */
    991 
    992 static void putIRegAH ( IRExpr* e )
    993 {
    994    vassert(host_endness == VexEndnessLE);
    995    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
    996    stmt( IRStmt_Put( OFFB_RAX+1, e ) );
    997 }
    998 
    999 
   1000 /* Read/write various widths of %RAX, as it has various
   1001    special-purpose uses. */
   1002 
   1003 static const HChar* nameIRegRAX ( Int sz )
   1004 {
   1005    switch (sz) {
   1006       case 1: return "%al";
   1007       case 2: return "%ax";
   1008       case 4: return "%eax";
   1009       case 8: return "%rax";
   1010       default: vpanic("nameIRegRAX(amd64)");
   1011    }
   1012 }
   1013 
   1014 static IRExpr* getIRegRAX ( Int sz )
   1015 {
   1016    vassert(host_endness == VexEndnessLE);
   1017    switch (sz) {
   1018       case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
   1019       case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
   1020       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
   1021       case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
   1022       default: vpanic("getIRegRAX(amd64)");
   1023    }
   1024 }
   1025 
   1026 static void putIRegRAX ( Int sz, IRExpr* e )
   1027 {
   1028    IRType ty = typeOfIRExpr(irsb->tyenv, e);
   1029    vassert(host_endness == VexEndnessLE);
   1030    switch (sz) {
   1031       case 8: vassert(ty == Ity_I64);
   1032               stmt( IRStmt_Put( OFFB_RAX, e ));
   1033               break;
   1034       case 4: vassert(ty == Ity_I32);
   1035               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
   1036               break;
   1037       case 2: vassert(ty == Ity_I16);
   1038               stmt( IRStmt_Put( OFFB_RAX, e ));
   1039               break;
   1040       case 1: vassert(ty == Ity_I8);
   1041               stmt( IRStmt_Put( OFFB_RAX, e ));
   1042               break;
   1043       default: vpanic("putIRegRAX(amd64)");
   1044    }
   1045 }
   1046 
   1047 
   1048 /* Read/write various widths of %RDX, as it has various
   1049    special-purpose uses. */
   1050 
   1051 static const HChar* nameIRegRDX ( Int sz )
   1052 {
   1053    switch (sz) {
   1054       case 1: return "%dl";
   1055       case 2: return "%dx";
   1056       case 4: return "%edx";
   1057       case 8: return "%rdx";
   1058       default: vpanic("nameIRegRDX(amd64)");
   1059    }
   1060 }
   1061 
   1062 static IRExpr* getIRegRDX ( Int sz )
   1063 {
   1064    vassert(host_endness == VexEndnessLE);
   1065    switch (sz) {
   1066       case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
   1067       case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
   1068       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
   1069       case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
   1070       default: vpanic("getIRegRDX(amd64)");
   1071    }
   1072 }
   1073 
   1074 static void putIRegRDX ( Int sz, IRExpr* e )
   1075 {
   1076    vassert(host_endness == VexEndnessLE);
   1077    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1078    switch (sz) {
   1079       case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
   1080               break;
   1081       case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
   1082               break;
   1083       case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
   1084               break;
   1085       case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
   1086               break;
   1087       default: vpanic("putIRegRDX(amd64)");
   1088    }
   1089 }
   1090 
   1091 
   1092 /* Simplistic functions to deal with the integer registers as a
   1093    straightforward bank of 16 64-bit regs. */
   1094 
   1095 static IRExpr* getIReg64 ( UInt regno )
   1096 {
   1097    return IRExpr_Get( integerGuestReg64Offset(regno),
   1098                       Ity_I64 );
   1099 }
   1100 
   1101 static void putIReg64 ( UInt regno, IRExpr* e )
   1102 {
   1103    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1104    stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
   1105 }
   1106 
   1107 static const HChar* nameIReg64 ( UInt regno )
   1108 {
   1109    return nameIReg( 8, regno, False );
   1110 }
   1111 
   1112 
   1113 /* Simplistic functions to deal with the lower halves of integer
   1114    registers as a straightforward bank of 16 32-bit regs. */
   1115 
   1116 static IRExpr* getIReg32 ( UInt regno )
   1117 {
   1118    vassert(host_endness == VexEndnessLE);
   1119    return unop(Iop_64to32,
   1120                IRExpr_Get( integerGuestReg64Offset(regno),
   1121                            Ity_I64 ));
   1122 }
   1123 
   1124 static void putIReg32 ( UInt regno, IRExpr* e )
   1125 {
   1126    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1127    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1128                      unop(Iop_32Uto64,e) ) );
   1129 }
   1130 
   1131 static const HChar* nameIReg32 ( UInt regno )
   1132 {
   1133    return nameIReg( 4, regno, False );
   1134 }
   1135 
   1136 
   1137 /* Simplistic functions to deal with the lower quarters of integer
   1138    registers as a straightforward bank of 16 16-bit regs. */
   1139 
   1140 static IRExpr* getIReg16 ( UInt regno )
   1141 {
   1142    vassert(host_endness == VexEndnessLE);
   1143    return IRExpr_Get( integerGuestReg64Offset(regno),
   1144                       Ity_I16 );
   1145 }
   1146 
   1147 static void putIReg16 ( UInt regno, IRExpr* e )
   1148 {
   1149    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1150    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1151                      unop(Iop_16Uto64,e) ) );
   1152 }
   1153 
   1154 static const HChar* nameIReg16 ( UInt regno )
   1155 {
   1156    return nameIReg( 2, regno, False );
   1157 }
   1158 
   1159 
   1160 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1161    which field of the REX byte is to be used to extend to a 4-bit
   1162    number.  These functions cater for that situation.
   1163 */
   1164 static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
   1165 {
   1166    vassert(lo3bits < 8);
   1167    vassert(IS_VALID_PFX(pfx));
   1168    return getIReg64( lo3bits | (getRexX(pfx) << 3) );
   1169 }
   1170 
   1171 static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
   1172 {
   1173    vassert(lo3bits < 8);
   1174    vassert(IS_VALID_PFX(pfx));
   1175    return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
   1176 }
   1177 
   1178 static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1179 {
   1180    vassert(lo3bits < 8);
   1181    vassert(IS_VALID_PFX(pfx));
   1182    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1183    return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1184                         toBool(sz==1 && !haveREX(pfx)) );
   1185 }
   1186 
   1187 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1188 {
   1189    vassert(lo3bits < 8);
   1190    vassert(IS_VALID_PFX(pfx));
   1191    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1192    if (sz == 4) {
   1193       sz = 8;
   1194       return unop(Iop_64to32,
   1195                   IRExpr_Get(
   1196                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1197                                      False/*!irregular*/ ),
   1198                      szToITy(sz)
   1199                  )
   1200              );
   1201    } else {
   1202       return IRExpr_Get(
   1203                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1204                                 toBool(sz==1 && !haveREX(pfx)) ),
   1205                 szToITy(sz)
   1206              );
   1207    }
   1208 }
   1209 
   1210 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1211 {
   1212    vassert(lo3bits < 8);
   1213    vassert(IS_VALID_PFX(pfx));
   1214    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1215    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1216    stmt( IRStmt_Put(
   1217             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1218                             toBool(sz==1 && !haveREX(pfx)) ),
   1219             sz==4 ? unop(Iop_32Uto64,e) : e
   1220    ));
   1221 }
   1222 
   1223 
   1224 /* Functions for getting register numbers from modrm bytes and REX
   1225    when we don't have to consider the complexities of integer subreg
   1226    accesses.
   1227 */
   1228 /* Extract the g reg field from a modRM byte, and augment it using the
   1229    REX.R bit from the supplied REX byte.  The R bit usually is
   1230    associated with the g register field.
   1231 */
   1232 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1233 {
   1234    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1235    reg += (pfx & PFX_REXR) ? 8 : 0;
   1236    return reg;
   1237 }
   1238 
   1239 /* Extract the e reg field from a modRM byte, and augment it using the
   1240    REX.B bit from the supplied REX byte.  The B bit usually is
   1241    associated with the e register field (when modrm indicates e is a
   1242    register, that is).
   1243 */
   1244 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1245 {
   1246    Int rm;
   1247    vassert(epartIsReg(mod_reg_rm));
   1248    rm = (Int)(mod_reg_rm & 0x7);
   1249    rm += (pfx & PFX_REXB) ? 8 : 0;
   1250    return rm;
   1251 }
   1252 
   1253 
   1254 /* General functions for dealing with integer register access. */
   1255 
   1256 /* Produce the guest state offset for a reference to the 'g' register
   1257    field in a modrm byte, taking into account REX (or its absence),
   1258    and the size of the access.
   1259 */
   1260 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1261 {
   1262    UInt reg;
   1263    vassert(host_endness == VexEndnessLE);
   1264    vassert(IS_VALID_PFX(pfx));
   1265    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1266    reg = gregOfRexRM( pfx, mod_reg_rm );
   1267    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1268 }
   1269 
   1270 static
   1271 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1272 {
   1273    if (sz == 4) {
   1274       sz = 8;
   1275       return unop(Iop_64to32,
   1276                   IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1277                               szToITy(sz) ));
   1278    } else {
   1279       return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1280                          szToITy(sz) );
   1281    }
   1282 }
   1283 
   1284 static
   1285 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1286 {
   1287    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1288    if (sz == 4) {
   1289       e = unop(Iop_32Uto64,e);
   1290    }
   1291    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1292 }
   1293 
   1294 static
   1295 const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1296 {
   1297    return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
   1298                         toBool(sz==1 && !haveREX(pfx)) );
   1299 }
   1300 
   1301 
   1302 static
   1303 IRExpr* getIRegV ( Int sz, Prefix pfx )
   1304 {
   1305    if (sz == 4) {
   1306       sz = 8;
   1307       return unop(Iop_64to32,
   1308                   IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1309                               szToITy(sz) ));
   1310    } else {
   1311       return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1312                          szToITy(sz) );
   1313    }
   1314 }
   1315 
   1316 static
   1317 void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
   1318 {
   1319    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1320    if (sz == 4) {
   1321       e = unop(Iop_32Uto64,e);
   1322    }
   1323    stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
   1324 }
   1325 
   1326 static
   1327 const HChar* nameIRegV ( Int sz, Prefix pfx )
   1328 {
   1329    return nameIReg( sz, getVexNvvvv(pfx), False );
   1330 }
   1331 
   1332 
   1333 
   1334 /* Produce the guest state offset for a reference to the 'e' register
   1335    field in a modrm byte, taking into account REX (or its absence),
   1336    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1337    denotes a memory access rather than a register access.
   1338 */
   1339 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1340 {
   1341    UInt reg;
   1342    vassert(host_endness == VexEndnessLE);
   1343    vassert(IS_VALID_PFX(pfx));
   1344    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1345    reg = eregOfRexRM( pfx, mod_reg_rm );
   1346    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1347 }
   1348 
   1349 static
   1350 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1351 {
   1352    if (sz == 4) {
   1353       sz = 8;
   1354       return unop(Iop_64to32,
   1355                   IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1356                               szToITy(sz) ));
   1357    } else {
   1358       return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1359                          szToITy(sz) );
   1360    }
   1361 }
   1362 
   1363 static
   1364 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1365 {
   1366    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1367    if (sz == 4) {
   1368       e = unop(Iop_32Uto64,e);
   1369    }
   1370    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1371 }
   1372 
   1373 static
   1374 const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1375 {
   1376    return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
   1377                         toBool(sz==1 && !haveREX(pfx)) );
   1378 }
   1379 
   1380 
   1381 /*------------------------------------------------------------*/
    1382 /*--- For dealing with XMM/YMM registers                   ---*/
   1383 /*------------------------------------------------------------*/
   1384 
   1385 static Int ymmGuestRegOffset ( UInt ymmreg )
   1386 {
   1387    switch (ymmreg) {
   1388       case 0:  return OFFB_YMM0;
   1389       case 1:  return OFFB_YMM1;
   1390       case 2:  return OFFB_YMM2;
   1391       case 3:  return OFFB_YMM3;
   1392       case 4:  return OFFB_YMM4;
   1393       case 5:  return OFFB_YMM5;
   1394       case 6:  return OFFB_YMM6;
   1395       case 7:  return OFFB_YMM7;
   1396       case 8:  return OFFB_YMM8;
   1397       case 9:  return OFFB_YMM9;
   1398       case 10: return OFFB_YMM10;
   1399       case 11: return OFFB_YMM11;
   1400       case 12: return OFFB_YMM12;
   1401       case 13: return OFFB_YMM13;
   1402       case 14: return OFFB_YMM14;
   1403       case 15: return OFFB_YMM15;
   1404       default: vpanic("ymmGuestRegOffset(amd64)");
   1405    }
   1406 }
   1407 
   1408 static Int xmmGuestRegOffset ( UInt xmmreg )
   1409 {
   1410    /* Correct for little-endian host only. */
   1411    vassert(host_endness == VexEndnessLE);
   1412    return ymmGuestRegOffset( xmmreg );
   1413 }
   1414 
    1415 /* Lanes of vector registers are always numbered from zero, zero being
    1416    the least significant lane (rightmost in the register). */
   1417 
   1418 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
   1419 {
   1420    /* Correct for little-endian host only. */
   1421    vassert(host_endness == VexEndnessLE);
   1422    vassert(laneno >= 0 && laneno < 8);
   1423    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
   1424 }
   1425 
   1426 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
   1427 {
   1428    /* Correct for little-endian host only. */
   1429    vassert(host_endness == VexEndnessLE);
   1430    vassert(laneno >= 0 && laneno < 4);
   1431    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
   1432 }
   1433 
   1434 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
   1435 {
   1436    /* Correct for little-endian host only. */
   1437    vassert(host_endness == VexEndnessLE);
   1438    vassert(laneno >= 0 && laneno < 2);
   1439    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
   1440 }
   1441 
   1442 static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
   1443 {
   1444    /* Correct for little-endian host only. */
   1445    vassert(host_endness == VexEndnessLE);
   1446    vassert(laneno >= 0 && laneno < 2);
   1447    return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
   1448 }
   1449 
   1450 static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
   1451 {
   1452    /* Correct for little-endian host only. */
   1453    vassert(host_endness == VexEndnessLE);
   1454    vassert(laneno >= 0 && laneno < 4);
   1455    return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
   1456 }
   1457 
   1458 static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
   1459 {
   1460    /* Correct for little-endian host only. */
   1461    vassert(host_endness == VexEndnessLE);
   1462    vassert(laneno >= 0 && laneno < 8);
   1463    return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
   1464 }
   1465 
   1466 static IRExpr* getXMMReg ( UInt xmmreg )
   1467 {
   1468    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
   1469 }
   1470 
   1471 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
   1472 {
   1473    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
   1474 }
   1475 
   1476 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
   1477 {
   1478    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
   1479 }
   1480 
   1481 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
   1482 {
   1483    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
   1484 }
   1485 
   1486 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
   1487 {
   1488    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
   1489 }
   1490 
   1491 static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
   1492 {
   1493    return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
   1494 }
   1495 
   1496 static void putXMMReg ( UInt xmmreg, IRExpr* e )
   1497 {
   1498    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1499    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
   1500 }
   1501 
   1502 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
   1503 {
   1504    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1505    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1506 }
   1507 
   1508 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
   1509 {
   1510    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1511    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1512 }
   1513 
   1514 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
   1515 {
   1516    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1517    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1518 }
   1519 
   1520 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
   1521 {
   1522    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1523    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1524 }
   1525 
   1526 static IRExpr* getYMMReg ( UInt ymmreg )
   1527 {
   1528    return IRExpr_Get( ymmGuestRegOffset(ymmreg), Ity_V256 );
   1529 }
   1530 
   1531 static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
   1532 {
   1533    return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
   1534 }
   1535 
   1536 static IRExpr* getYMMRegLane64F ( UInt ymmreg, Int laneno )
   1537 {
   1538    return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_F64 );
   1539 }
   1540 
   1541 static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
   1542 {
   1543    return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
   1544 }
   1545 
   1546 static IRExpr* getYMMRegLane32F ( UInt ymmreg, Int laneno )
   1547 {
   1548    return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_F32 );
   1549 }
   1550 
   1551 static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
   1552 {
   1553    return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
   1554 }
   1555 
   1556 static void putYMMReg ( UInt ymmreg, IRExpr* e )
   1557 {
   1558    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   1559    stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
   1560 }
   1561 
   1562 static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
   1563 {
   1564    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1565    stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
   1566 }
   1567 
   1568 static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
   1569 {
   1570    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1571    stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
   1572 }
   1573 
   1574 static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
   1575 {
   1576    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1577    stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
   1578 }
   1579 
   1580 static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
   1581 {
   1582    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1583    stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
   1584 }
   1585 
   1586 static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
   1587 {
   1588    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1589    stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
   1590 }
   1591 
   1592 static IRExpr* mkV128 ( UShort mask )
   1593 {
   1594    return IRExpr_Const(IRConst_V128(mask));
   1595 }
   1596 
   1597 /* Write the low half of a YMM reg and zero out the upper half. */
   1598 static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
   1599 {
   1600    putYMMRegLane128( ymmreg, 0, e );
   1601    putYMMRegLane128( ymmreg, 1, mkV128(0) );
   1602 }
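
        /* (Illustrative note, not in the original source: this matches the
           VEX.128 zero-upper rule -- e.g. a VEX-encoded "vmovaps %xmm1,
           %xmm5" zeroes bits 255:128 of %ymm5, which is exactly what
           putYMMRegLoAndZU expresses.) */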
   1603 
   1604 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1605 {
   1606    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1607    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1608    return unop(Iop_64to1,
   1609                binop(Iop_And64,
   1610                      unop(Iop_1Uto64,x),
   1611                      unop(Iop_1Uto64,y)));
   1612 }
   1613 
   1614 /* Generate a compare-and-swap operation, operating on memory at
   1615    'addr'.  The expected value is 'expVal' and the new value is
   1616    'newVal'.  If the operation fails, then transfer control (with a
   1617    no-redir jump (XXX no -- see comment at top of this file)) to
   1618    'restart_point', which is presumably the address of the guest
   1619    instruction again -- retrying, essentially. */
   1620 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
   1621                     Addr64 restart_point )
   1622 {
   1623    IRCAS* cas;
   1624    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   1625    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   1626    IRTemp oldTmp = newTemp(tyE);
   1627    IRTemp expTmp = newTemp(tyE);
   1628    vassert(tyE == tyN);
   1629    vassert(tyE == Ity_I64 || tyE == Ity_I32
   1630            || tyE == Ity_I16 || tyE == Ity_I8);
   1631    assign(expTmp, expVal);
   1632    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
   1633                   NULL, mkexpr(expTmp), NULL, newVal );
   1634    stmt( IRStmt_CAS(cas) );
   1635    stmt( IRStmt_Exit(
   1636             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
   1637                    mkexpr(oldTmp), mkexpr(expTmp) ),
   1638             Ijk_Boring, /*Ijk_NoRedir*/
   1639             IRConst_U64( restart_point ),
   1640             OFFB_RIP
   1641          ));
   1642 }
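
        /* Illustrative sketch (not part of the original source): the typical
           caller pattern for casLE when a LOCK-prefixed read-modify-write
           targets memory.  'addr', 'src' and 'ty' stand in for values the
           real callers set up. */
        #if 0
           IRTemp expd = newTemp(ty);        /* value we expect at [addr] */
           IRTemp data = newTemp(ty);        /* new value to write back   */
           assign( expd, loadLE(ty, mkexpr(addr)) );
           assign( data, binop(mkSizedOp(ty,Iop_Add8),
                               mkexpr(expd), mkexpr(src)) );
           /* If [addr] no longer holds 'expd', casLE skips the store and
              restarts the guest instruction instead. */
           casLE( mkexpr(addr), mkexpr(expd), mkexpr(data),
                  guest_RIP_curr_instr );
        #endif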
   1643 
   1644 
   1645 /*------------------------------------------------------------*/
   1646 /*--- Helpers for %rflags.                                 ---*/
   1647 /*------------------------------------------------------------*/
   1648 
   1649 /* -------------- Evaluating the flags-thunk. -------------- */
   1650 
   1651 /* Build IR to calculate all the eflags from stored
   1652    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1653    Ity_I64. */
   1654 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1655 {
   1656    IRExpr** args
   1657       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1658                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1659                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1660                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1661    IRExpr* call
   1662       = mkIRExprCCall(
   1663            Ity_I64,
   1664            0/*regparm*/,
   1665            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1666            args
   1667         );
   1668    /* Exclude OP and NDEP from definedness checking.  We're only
   1669       interested in DEP1 and DEP2. */
   1670    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1671    return call;
   1672 }
   1673 
   1674 /* Build IR to calculate some particular condition from stored
   1675    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1676    Ity_Bit. */
   1677 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1678 {
   1679    IRExpr** args
   1680       = mkIRExprVec_5( mkU64(cond),
   1681                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1682                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1683                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1684                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1685    IRExpr* call
   1686       = mkIRExprCCall(
   1687            Ity_I64,
   1688            0/*regparm*/,
   1689            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1690            args
   1691         );
   1692    /* Exclude the requested condition, OP and NDEP from definedness
   1693       checking.  We're only interested in DEP1 and DEP2. */
   1694    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1695    return unop(Iop_64to1, call);
   1696 }
   1697 
   1698 /* Build IR to calculate just the carry flag from stored
   1699    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1700 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1701 {
   1702    IRExpr** args
   1703       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1704                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1705                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1706                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1707    IRExpr* call
   1708       = mkIRExprCCall(
   1709            Ity_I64,
   1710            0/*regparm*/,
   1711            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1712            args
   1713         );
   1714    /* Exclude OP and NDEP from definedness checking.  We're only
   1715       interested in DEP1 and DEP2. */
   1716    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1717    return call;
   1718 }
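
        /* (Illustrative note, not in the original source: after, say,
           "addq %rbx, %rax" the thunk holds CC_OP = AMD64G_CC_OP_ADDQ,
           CC_DEP1 = the old %rax and CC_DEP2 = %rbx, and the helpers above
           reconstruct any flag from those values on demand.) */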
   1719 
   1720 
   1721 /* -------------- Building the flags-thunk. -------------- */
   1722 
   1723 /* The machinery in this section builds the flag-thunk following a
   1724    flag-setting operation.  Hence the various setFlags_* functions.
   1725 */
   1726 
   1727 static Bool isAddSub ( IROp op8 )
   1728 {
   1729    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1730 }
   1731 
   1732 static Bool isLogic ( IROp op8 )
   1733 {
   1734    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1735 }
   1736 
   1737 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1738 static IRExpr* widenUto64 ( IRExpr* e )
   1739 {
   1740    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1741       case Ity_I64: return e;
   1742       case Ity_I32: return unop(Iop_32Uto64, e);
   1743       case Ity_I16: return unop(Iop_16Uto64, e);
   1744       case Ity_I8:  return unop(Iop_8Uto64, e);
   1745       case Ity_I1:  return unop(Iop_1Uto64, e);
   1746       default: vpanic("widenUto64");
   1747    }
   1748 }
   1749 
   1750 /* S-widen 8/16/32/64 bit int expr to 64. */
   1751 static IRExpr* widenSto64 ( IRExpr* e )
   1752 {
   1753    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1754       case Ity_I64: return e;
   1755       case Ity_I32: return unop(Iop_32Sto64, e);
   1756       case Ity_I16: return unop(Iop_16Sto64, e);
   1757       case Ity_I8:  return unop(Iop_8Sto64, e);
   1758       default: vpanic("widenSto64");
   1759    }
   1760 }
   1761 
   1762 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1763    of these combinations make sense. */
   1764 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1765 {
   1766    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1767    if (src_ty == dst_ty)
   1768       return e;
   1769    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1770       return unop(Iop_32to16, e);
   1771    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1772       return unop(Iop_32to8, e);
   1773    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1774       return unop(Iop_64to32, e);
   1775    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1776       return unop(Iop_64to16, e);
   1777    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1778       return unop(Iop_64to8, e);
   1779 
   1780    vex_printf("\nsrc, dst tys are: ");
   1781    ppIRType(src_ty);
   1782    vex_printf(", ");
   1783    ppIRType(dst_ty);
   1784    vex_printf("\n");
   1785    vpanic("narrowTo(amd64)");
   1786 }
   1787 
   1788 
   1789 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1790    auto-sized up to the real op. */
   1791 
   1792 static
   1793 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1794 {
   1795    Int ccOp = 0;
   1796    switch (ty) {
   1797       case Ity_I8:  ccOp = 0; break;
   1798       case Ity_I16: ccOp = 1; break;
   1799       case Ity_I32: ccOp = 2; break;
   1800       case Ity_I64: ccOp = 3; break;
   1801       default: vassert(0);
   1802    }
   1803    switch (op8) {
   1804       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1805       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1806       default:       ppIROp(op8);
   1807                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1808    }
   1809    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1810    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1811    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1812 }
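
        /* Illustrative sketch (not part of the original source): how a plain
           register-register "add" would drive the thunk machinery above.
           'sz', 'pfx' and 'rm' stand in for decoder state. */
        #if 0
           IRType ty   = szToITy(sz);
           IRTemp dst0 = newTemp(ty);   /* old destination value */
           IRTemp src  = newTemp(ty);   /* source operand        */
           IRTemp dst1 = newTemp(ty);   /* the sum               */
           assign( dst0, getIRegG(sz,pfx,rm) );
           assign( src,  getIRegE(sz,pfx,rm) );
           assign( dst1, binop(mkSizedOp(ty,Iop_Add8),
                               mkexpr(dst0), mkexpr(src)) );
           /* Record the operands; individual flags are computed only if
              something later asks for them. */
           setFlags_DEP1_DEP2( Iop_Add8, dst0, src, ty );
           putIRegG( sz, pfx, rm, mkexpr(dst1) );
        #endif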
   1813 
   1814 
   1815 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1816 
   1817 static
   1818 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1819 {
   1820    Int ccOp = 0;
   1821    switch (ty) {
   1822       case Ity_I8:  ccOp = 0; break;
   1823       case Ity_I16: ccOp = 1; break;
   1824       case Ity_I32: ccOp = 2; break;
   1825       case Ity_I64: ccOp = 3; break;
   1826       default: vassert(0);
   1827    }
   1828    switch (op8) {
   1829       case Iop_Or8:
   1830       case Iop_And8:
   1831       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1832       default:       ppIROp(op8);
   1833                      vpanic("setFlags_DEP1(amd64)");
   1834    }
   1835    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1836    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1837    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1838 }
   1839 
   1840 
   1841 /* For shift operations, we store the result in DEP1 and the
   1842    undershifted result in DEP2.  If the shift amount is zero, however,
   1843    the thunk is left unchanged. */
   1844 
   1845 static void setFlags_DEP1_DEP2_shift ( IROp    op64,
   1846                                        IRTemp  res,
   1847                                        IRTemp  resUS,
   1848                                        IRType  ty,
   1849                                        IRTemp  guard )
   1850 {
   1851    Int ccOp = 0;
   1852    switch (ty) {
   1853       case Ity_I8:  ccOp = 0; break;
   1854       case Ity_I16: ccOp = 1; break;
   1855       case Ity_I32: ccOp = 2; break;
   1856       case Ity_I64: ccOp = 3; break;
   1857       default: vassert(0);
   1858    }
   1859 
   1860    vassert(guard);
   1861 
   1862    /* Both kinds of right shifts are handled by the same thunk
   1863       operation. */
   1864    switch (op64) {
   1865       case Iop_Shr64:
   1866       case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
   1867       case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
   1868       default:        ppIROp(op64);
   1869                       vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   1870    }
   1871 
   1872    /* guard :: Ity_I8.  We need to convert it to I1. */
   1873    IRTemp guardB = newTemp(Ity_I1);
   1874    assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
   1875 
   1876    /* DEP1 contains the result, DEP2 contains the undershifted value. */
   1877    stmt( IRStmt_Put( OFFB_CC_OP,
   1878                      IRExpr_ITE( mkexpr(guardB),
   1879                                  mkU64(ccOp),
   1880                                  IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   1881    stmt( IRStmt_Put( OFFB_CC_DEP1,
   1882                      IRExpr_ITE( mkexpr(guardB),
   1883                                  widenUto64(mkexpr(res)),
   1884                                  IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   1885    stmt( IRStmt_Put( OFFB_CC_DEP2,
   1886                      IRExpr_ITE( mkexpr(guardB),
   1887                                  widenUto64(mkexpr(resUS)),
   1888                                  IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
   1889 }
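
        /* Illustrative sketch (not part of the original source): how a
           "shl %cl, reg" translation would drive the guarded update above.
           'val' stands in for the value being shifted. */
        #if 0
           IRTemp amt   = newTemp(Ity_I8);   /* count, masked per AMD64   */
           IRTemp res   = newTemp(Ity_I64);  /* fully shifted result      */
           IRTemp resUS = newTemp(Ity_I64);  /* shifted by one bit less   */
           assign( amt,   binop(Iop_And8, getIRegCL(), mkU8(63)) );
           assign( res,   binop(Iop_Shl64, mkexpr(val), mkexpr(amt)) );
           assign( resUS, binop(Iop_Shl64, mkexpr(val),
                                binop(Iop_Sub8, mkexpr(amt), mkU8(1))) );
           /* 'amt' doubles as the guard: a zero count leaves the thunk,
              and hence the flags, untouched. */
           setFlags_DEP1_DEP2_shift( Iop_Shl64, res, resUS, Ity_I64, amt );
        #endif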
   1890 
   1891 
   1892 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1893    the former value of the carry flag, which unfortunately we have to
   1894    compute. */
   1895 
   1896 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1897 {
   1898    Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
   1899 
   1900    switch (ty) {
   1901       case Ity_I8:  ccOp += 0; break;
   1902       case Ity_I16: ccOp += 1; break;
   1903       case Ity_I32: ccOp += 2; break;
   1904       case Ity_I64: ccOp += 3; break;
   1905       default: vassert(0);
   1906    }
   1907 
   1908    /* This has to come first, because calculating the C flag
   1909       may require reading all four thunk fields. */
   1910    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   1911    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1912    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   1913    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1914 }
   1915 
   1916 
   1917 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1918    two arguments. */
   1919 
   1920 static
   1921 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1922 {
   1923    switch (ty) {
   1924       case Ity_I8:
   1925          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1926          break;
   1927       case Ity_I16:
   1928          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1929          break;
   1930       case Ity_I32:
   1931          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1932          break;
   1933       case Ity_I64:
   1934          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1935          break;
   1936       default:
   1937          vpanic("setFlags_MUL(amd64)");
   1938    }
   1939    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1940    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1941 }
   1942 
   1943 
   1944 /* -------------- Condition codes. -------------- */
   1945 
   1946 /* Condition codes, using the AMD encoding.  */
   1947 
   1948 static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
   1949 {
   1950    switch (cond) {
   1951       case AMD64CondO:      return "o";
   1952       case AMD64CondNO:     return "no";
   1953       case AMD64CondB:      return "b";
   1954       case AMD64CondNB:     return "ae"; /*"nb";*/
   1955       case AMD64CondZ:      return "e"; /*"z";*/
   1956       case AMD64CondNZ:     return "ne"; /*"nz";*/
   1957       case AMD64CondBE:     return "be";
   1958       case AMD64CondNBE:    return "a"; /*"nbe";*/
   1959       case AMD64CondS:      return "s";
   1960       case AMD64CondNS:     return "ns";
   1961       case AMD64CondP:      return "p";
   1962       case AMD64CondNP:     return "np";
   1963       case AMD64CondL:      return "l";
   1964       case AMD64CondNL:     return "ge"; /*"nl";*/
   1965       case AMD64CondLE:     return "le";
   1966       case AMD64CondNLE:    return "g"; /*"nle";*/
   1967       case AMD64CondAlways: return "ALWAYS";
   1968       default: vpanic("name_AMD64Condcode");
   1969    }
   1970 }
   1971 
   1972 static
   1973 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1974                                           /*OUT*/Bool*   needInvert )
   1975 {
   1976    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1977    if (cond & 1) {
   1978       *needInvert = True;
   1979       return cond-1;
   1980    } else {
   1981       *needInvert = False;
   1982       return cond;
   1983    }
   1984 }
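
        /* (Illustrative note, not in the original source: the AMD condition
           encoding pairs each positive condition with its negation at the
           next odd number, so e.g. AMD64CondNZ (5) comes back as
           AMD64CondZ (4) with *needInvert set to True.) */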
   1985 
   1986 
   1987 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1988 
   1989 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1990    appropriately.
   1991 
   1992    Optionally, generate a store for the 'tres' value.  This can either
   1993    be a normal store, or it can be a cas-with-possible-failure style
   1994    store:
   1995 
   1996    if taddr is IRTemp_INVALID, then no store is generated.
   1997 
   1998    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1999    the address) is generated:
   2000 
   2001      if texpVal is IRTemp_INVALID then a normal store is
   2002      generated, and restart_point must be zero (it is irrelevant).
   2003 
   2004      if texpVal is not IRTemp_INVALID then a cas-style store is
   2005      generated.  texpVal is the expected value, restart_point
   2006      is the restart point if the store fails, and texpVal must
   2007      have the same type as tres.
   2008 
   2009 */
   2010 static void helper_ADC ( Int sz,
   2011                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   2012                          /* info about optional store: */
   2013                          IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
   2014 {
   2015    UInt    thunkOp;
   2016    IRType  ty    = szToITy(sz);
   2017    IRTemp  oldc  = newTemp(Ity_I64);
   2018    IRTemp  oldcn = newTemp(ty);
   2019    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   2020    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   2021 
   2022    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   2023 
   2024    switch (sz) {
   2025       case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
   2026       case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
   2027       case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
   2028       case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
   2029       default: vassert(0);
   2030    }
   2031 
   2032    /* oldc = old carry flag, 0 or 1 */
   2033    assign( oldc,  binop(Iop_And64,
   2034                         mk_amd64g_calculate_rflags_c(),
   2035                         mkU64(1)) );
   2036 
   2037    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   2038 
   2039    assign( tres, binop(plus,
   2040                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   2041                        mkexpr(oldcn)) );
   2042 
   2043    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   2044       start of this function. */
   2045    if (taddr != IRTemp_INVALID) {
   2046       if (texpVal == IRTemp_INVALID) {
   2047          vassert(restart_point == 0);
   2048          storeLE( mkexpr(taddr), mkexpr(tres) );
   2049       } else {
   2050          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   2051          /* .. and hence 'texpVal' has the same type as 'tres'. */
   2052          casLE( mkexpr(taddr),
   2053                 mkexpr(texpVal), mkexpr(tres), restart_point );
   2054       }
   2055    }
   2056 
   2057    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   2058    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   2059    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
   2060                                                          mkexpr(oldcn)) )) );
   2061    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   2062 }
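
        /* Illustrative sketch (not part of the original source): the three
           store modes described in the comment above helper_ADC, with temps
           standing in for decoder state. */
        #if 0
           /* No store at all: taddr is IRTemp_INVALID. */
           helper_ADC( sz, tres, ta1, ta2,
                       IRTemp_INVALID, IRTemp_INVALID, 0 );
           /* Plain (non-LOCKed) store to [taddr]. */
           helper_ADC( sz, tres, ta1, ta2,
                       taddr, IRTemp_INVALID, 0 );
           /* LOCKed store: cas-style, re-running the instruction if [taddr]
              changed from texpVal between the load and the store. */
           helper_ADC( sz, tres, ta1, ta2,
                       taddr, texpVal, guest_RIP_curr_instr );
        #endif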
   2063 
   2064 
   2065 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2066    appropriately.  As with helper_ADC, possibly generate a store of
   2067    the result -- see comments on helper_ADC for details.
   2068 */
   2069 static void helper_SBB ( Int sz,
   2070                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   2071                          /* info about optional store: */
   2072                          IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
   2073 {
   2074    UInt    thunkOp;
   2075    IRType  ty    = szToITy(sz);
   2076    IRTemp  oldc  = newTemp(Ity_I64);
   2077    IRTemp  oldcn = newTemp(ty);
   2078    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   2079    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   2080 
   2081    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   2082 
   2083    switch (sz) {
   2084       case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
   2085       case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
   2086       case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
   2087       case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
   2088       default: vassert(0);
   2089    }
   2090 
   2091    /* oldc = old carry flag, 0 or 1 */
   2092    assign( oldc, binop(Iop_And64,
   2093                        mk_amd64g_calculate_rflags_c(),
   2094                        mkU64(1)) );
   2095 
   2096    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   2097 
   2098    assign( tres, binop(minus,
   2099                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   2100                        mkexpr(oldcn)) );
   2101 
   2102    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   2103       start of this function. */
   2104    if (taddr != IRTemp_INVALID) {
   2105       if (texpVal == IRTemp_INVALID) {
   2106          vassert(restart_point == 0);
   2107          storeLE( mkexpr(taddr), mkexpr(tres) );
   2108       } else {
   2109          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   2110          /* .. and hence 'texpVal' has the same type as 'tres'. */
   2111          casLE( mkexpr(taddr),
   2112                 mkexpr(texpVal), mkexpr(tres), restart_point );
   2113       }
   2114    }
   2115 
   2116    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   2117    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   2118    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
   2119                                                          mkexpr(oldcn)) )) );
   2120    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   2121 }
   2122 
   2123 
   2124 /* Given ta1, ta2 and tres, compute tres = ADCX(ta1,ta2) or tres = ADOX(ta1,ta2)
   2125    and set flags appropriately.
   2126 */
   2127 static void helper_ADCX_ADOX ( Bool isADCX, Int sz,
   2128                                IRTemp tres, IRTemp ta1, IRTemp ta2 )
   2129 {
   2130    UInt    thunkOp;
   2131    IRType  ty        = szToITy(sz);
   2132    IRTemp  oldflags  = newTemp(Ity_I64);
   2133    IRTemp  oldOC     = newTemp(Ity_I64); // old O or C flag
   2134    IRTemp  oldOCn    = newTemp(ty);      // old O or C flag, narrowed
   2135    IROp    plus      = mkSizedOp(ty, Iop_Add8);
   2136    IROp    xor       = mkSizedOp(ty, Iop_Xor8);
   2137 
   2138    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   2139 
   2140    switch (sz) {
   2141       case 8:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX64
   2142                                 : AMD64G_CC_OP_ADOX64; break;
   2143       case 4:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX32
   2144                                 : AMD64G_CC_OP_ADOX32; break;
   2145       default: vassert(0);
   2146    }
   2147 
   2148    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   2149 
   2150    /* oldOC = old overflow/carry flag, 0 or 1 */
   2151    assign( oldOC, binop(Iop_And64,
   2152                         binop(Iop_Shr64,
   2153                               mkexpr(oldflags),
   2154                               mkU8(isADCX ? AMD64G_CC_SHIFT_C
   2155                                           : AMD64G_CC_SHIFT_O)),
   2156                         mkU64(1)) );
   2157 
   2158    assign( oldOCn, narrowTo(ty, mkexpr(oldOC)) );
   2159 
   2160    assign( tres, binop(plus,
   2161                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   2162                        mkexpr(oldOCn)) );
   2163 
   2164    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   2165    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   2166    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
   2167                                                          mkexpr(oldOCn)) )) );
   2168    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldflags) ) );
   2169 }
   2170 
   2171 
   2172 /* -------------- Helpers for disassembly printing. -------------- */
   2173 
   2174 static const HChar* nameGrp1 ( Int opc_aux )
   2175 {
   2176    static const HChar* grp1_names[8]
   2177      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2178    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2179    return grp1_names[opc_aux];
   2180 }
   2181 
   2182 static const HChar* nameGrp2 ( Int opc_aux )
   2183 {
   2184    static const HChar* grp2_names[8]
   2185      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2186    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2187    return grp2_names[opc_aux];
   2188 }
   2189 
   2190 static const HChar* nameGrp4 ( Int opc_aux )
   2191 {
   2192    static const HChar* grp4_names[8]
   2193      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2194    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2195    return grp4_names[opc_aux];
   2196 }
   2197 
   2198 static const HChar* nameGrp5 ( Int opc_aux )
   2199 {
   2200    static const HChar* grp5_names[8]
   2201      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2202    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2203    return grp5_names[opc_aux];
   2204 }
   2205 
   2206 static const HChar* nameGrp8 ( Int opc_aux )
   2207 {
   2208    static const HChar* grp8_names[8]
   2209       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2210    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2211    return grp8_names[opc_aux];
   2212 }
   2213 
   2214 static const HChar* nameSReg ( UInt sreg )
   2215 {
   2216    switch (sreg) {
   2217       case R_ES: return "%es";
   2218       case R_CS: return "%cs";
   2219       case R_SS: return "%ss";
   2220       case R_DS: return "%ds";
   2221       case R_FS: return "%fs";
   2222       case R_GS: return "%gs";
   2223       default: vpanic("nameSReg(amd64)");
   2224    }
   2225 }
   2226 
   2227 static const HChar* nameMMXReg ( Int mmxreg )
   2228 {
   2229    static const HChar* mmx_names[8]
   2230      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2231    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2232    return mmx_names[mmxreg];
   2233 }
   2234 
   2235 static const HChar* nameXMMReg ( Int xmmreg )
   2236 {
   2237    static const HChar* xmm_names[16]
   2238      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2239          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2240          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2241          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2242    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2243    return xmm_names[xmmreg];
   2244 }
   2245 
   2246 static const HChar* nameMMXGran ( Int gran )
   2247 {
   2248    switch (gran) {
   2249       case 0: return "b";
   2250       case 1: return "w";
   2251       case 2: return "d";
   2252       case 3: return "q";
   2253       default: vpanic("nameMMXGran(amd64,guest)");
   2254    }
   2255 }
   2256 
   2257 static HChar nameISize ( Int size )
   2258 {
   2259    switch (size) {
   2260       case 8: return 'q';
   2261       case 4: return 'l';
   2262       case 2: return 'w';
   2263       case 1: return 'b';
   2264       default: vpanic("nameISize(amd64)");
   2265    }
   2266 }
   2267 
   2268 static const HChar* nameYMMReg ( Int ymmreg )
   2269 {
   2270    static const HChar* ymm_names[16]
   2271      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2272          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2273          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2274          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2275    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2276    return ymm_names[ymmreg];
   2277 }
   2278 
   2279 
   2280 /*------------------------------------------------------------*/
   2281 /*--- JMP helpers                                          ---*/
   2282 /*------------------------------------------------------------*/
   2283 
   2284 static void jmp_lit( /*MOD*/DisResult* dres,
   2285                      IRJumpKind kind, Addr64 d64 )
   2286 {
   2287    vassert(dres->whatNext    == Dis_Continue);
   2288    vassert(dres->len         == 0);
   2289    vassert(dres->continueAt  == 0);
   2290    vassert(dres->jk_StopHere == Ijk_INVALID);
   2291    dres->whatNext    = Dis_StopHere;
   2292    dres->jk_StopHere = kind;
   2293    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2294 }
   2295 
   2296 static void jmp_treg( /*MOD*/DisResult* dres,
   2297                       IRJumpKind kind, IRTemp t )
   2298 {
   2299    vassert(dres->whatNext    == Dis_Continue);
   2300    vassert(dres->len         == 0);
   2301    vassert(dres->continueAt  == 0);
   2302    vassert(dres->jk_StopHere == Ijk_INVALID);
   2303    dres->whatNext    = Dis_StopHere;
   2304    dres->jk_StopHere = kind;
   2305    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2306 }
   2307 
   2308 static
   2309 void jcc_01 ( /*MOD*/DisResult* dres,
   2310               AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
   2311 {
   2312    Bool          invert;
   2313    AMD64Condcode condPos;
   2314    vassert(dres->whatNext    == Dis_Continue);
   2315    vassert(dres->len         == 0);
   2316    vassert(dres->continueAt  == 0);
   2317    vassert(dres->jk_StopHere == Ijk_INVALID);
   2318    dres->whatNext    = Dis_StopHere;
   2319    dres->jk_StopHere = Ijk_Boring;
   2320    condPos = positiveIse_AMD64Condcode ( cond, &invert );
   2321    if (invert) {
   2322       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2323                          Ijk_Boring,
   2324                          IRConst_U64(d64_false),
   2325                          OFFB_RIP ) );
   2326       stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   2327    } else {
   2328       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2329                          Ijk_Boring,
   2330                          IRConst_U64(d64_true),
   2331                          OFFB_RIP ) );
   2332       stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   2333    }
   2334 }
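
        /* (Illustrative note, not in the original source: e.g. "jnz lbl"
           arrives as AMD64CondNZ, an odd (negated) code, so positiveIse
           flips it to AMD64CondZ and sets 'invert'; the block then
           side-exits to the fall-through address when Z holds, and
           otherwise falls into an unconditional jump to lbl.) */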
   2335 
   2336 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2337    guest address of the next instruction to be executed.
   2338 
   2339    This function generates an AbiHint to say that -128(%rsp)
   2340    .. -1(%rsp) should now be regarded as uninitialised.
   2341 */
   2342 static
   2343 void make_redzone_AbiHint ( const VexAbiInfo* vbi,
   2344                             IRTemp new_rsp, IRTemp nia, const HChar* who )
   2345 {
   2346    Int szB = vbi->guest_stack_redzone_size;
   2347    vassert(szB >= 0);
   2348 
   2349    /* A bit of a kludge.  Currently the only ABI we've guested AMD64
   2350       for is ELF.  So just check it's the expected 128 value
   2351       (paranoia). */
   2352    vassert(szB == 128);
   2353 
   2354    if (0) vex_printf("AbiHint: %s\n", who);
   2355    vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   2356    vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   2357    if (szB > 0)
   2358       stmt( IRStmt_AbiHint(
   2359                binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
   2360                szB,
   2361                mkexpr(nia)
   2362             ));
   2363 }
   2364 
   2365 
   2366 /*------------------------------------------------------------*/
   2367 /*--- Disassembling addressing modes                       ---*/
   2368 /*------------------------------------------------------------*/
   2369 
   2370 static
   2371 const HChar* segRegTxt ( Prefix pfx )
   2372 {
   2373    if (pfx & PFX_CS) return "%cs:";
   2374    if (pfx & PFX_DS) return "%ds:";
   2375    if (pfx & PFX_ES) return "%es:";
   2376    if (pfx & PFX_FS) return "%fs:";
   2377    if (pfx & PFX_GS) return "%gs:";
   2378    if (pfx & PFX_SS) return "%ss:";
   2379    return ""; /* no override */
   2380 }
   2381 
   2382 
   2383 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2384    linear address by adding any required segment override as indicated
   2385    by sorb, and also dealing with any address size override
   2386    present. */
   2387 static
   2388 IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
   2389                               Prefix pfx, IRExpr* virtual )
   2390 {
   2391    /* --- address size override --- */
   2392    if (haveASO(pfx))
   2393       virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
   2394 
   2395    /* Note that the cases below are hacks that rely on the assumption
   2396       that %fs and %gs are constant.
   2397       Typically, %fs is always 0x63 on Linux (in the main thread, it
   2398       stays at value 0), %gs always 0x60 on Darwin, ... */
   2399    /* --- segment overrides --- */
   2400    if (pfx & PFX_FS) {
   2401       if (vbi->guest_amd64_assume_fs_is_const) {
   2402          /* return virtual + guest_FS_CONST. */
   2403          virtual = binop(Iop_Add64, virtual,
   2404                                     IRExpr_Get(OFFB_FS_CONST, Ity_I64));
   2405       } else {
   2406          unimplemented("amd64 %fs segment override");
   2407       }
   2408    }
   2409 
   2410    if (pfx & PFX_GS) {
   2411       if (vbi->guest_amd64_assume_gs_is_const) {
   2412          /* return virtual + guest_GS_CONST. */
   2413          virtual = binop(Iop_Add64, virtual,
   2414                                     IRExpr_Get(OFFB_GS_CONST, Ity_I64));
   2415       } else {
   2416          unimplemented("amd64 %gs segment override");
   2417       }
   2418    }
   2419 
   2420    /* cs, ds, es and ss are simply ignored in 64-bit mode. */
   2421 
   2422    return virtual;
   2423 }
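
        /* (Illustrative note, not in the original source: on Linux a read
           of the stack-protector canary, "mov %fs:0x28, %rax", therefore
           has its address rewritten to Get(OFFB_FS_CONST) + 0x28, leaning
           on the constant-%fs assumption above.) */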
   2424 
   2425 //.. {
   2426 //..    Int    sreg;
   2427 //..    IRType hWordTy;
   2428 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2429 //..
   2430 //..    if (sorb == 0)
   2431 //..       /* the common case - no override */
   2432 //..       return virtual;
   2433 //..
   2434 //..    switch (sorb) {
   2435 //..       case 0x3E: sreg = R_DS; break;
   2436 //..       case 0x26: sreg = R_ES; break;
   2437 //..       case 0x64: sreg = R_FS; break;
   2438 //..       case 0x65: sreg = R_GS; break;
   2439 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2440 //..    }
   2441 //..
   2442 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2443 //..
   2444 //..    seg_selector = newTemp(Ity_I32);
   2445 //..    ldt_ptr      = newTemp(hWordTy);
   2446 //..    gdt_ptr      = newTemp(hWordTy);
   2447 //..    r64          = newTemp(Ity_I64);
   2448 //..
   2449 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2450 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2451 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2452 //..
   2453 //..    /*
   2454 //..    Call this to do the translation and limit checks:
   2455 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2456 //..                                  UInt seg_selector, UInt virtual_addr )
   2457 //..    */
   2458 //..    assign(
   2459 //..       r64,
   2460 //..       mkIRExprCCall(
   2461 //..          Ity_I64,
   2462 //..          0/*regparms*/,
   2463 //..          "x86g_use_seg_selector",
   2464 //..          &x86g_use_seg_selector,
   2465 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2466 //..                         mkexpr(seg_selector), virtual)
   2467 //..       )
   2468 //..    );
   2469 //..
   2470 //..    /* If the high 32 of the result are non-zero, there was a
   2471 //..       failure in address translation.  In which case, make a
   2472 //..       quick exit.
   2473 //..    */
   2474 //..    stmt(
   2475 //..       IRStmt_Exit(
   2476 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2477 //..          Ijk_MapFail,
   2478 //..          IRConst_U32( guest_eip_curr_instr )
   2479 //..       )
   2480 //..    );
   2481 //..
   2482 //..    /* otherwise, here's the translated result. */
   2483 //..    return unop(Iop_64to32, mkexpr(r64));
   2484 //.. }
   2485 
   2486 
   2487 /* Generate IR to calculate an address indicated by a ModRM and
   2488    following SIB bytes.  The expression, and the number of bytes in
   2489    the address mode, are returned (the latter in *len).  Note that
   2490    this fn should not be called if the R/M part of the address denotes
   2491    a register instead of memory.  If print_codegen is true, text of
   2492    the addressing mode is placed in buf.
   2493 
   2494    The computed address is stored in a new tempreg, and the
   2495    identity of the tempreg is returned.
   2496 
   2497    extra_bytes holds the number of bytes after the amode, as supplied
   2498    by the caller.  This is needed to make sense of %rip-relative
   2499    addresses.  Note that the value that *len is set to is only the
   2500    length of the amode itself and does not include the value supplied
   2501    in extra_bytes.
   2502  */
   2503 
   2504 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2505 {
   2506    IRTemp tmp = newTemp(Ity_I64);
   2507    assign( tmp, addr64 );
   2508    return tmp;
   2509 }
   2510 
   2511 static
   2512 IRTemp disAMode ( /*OUT*/Int* len,
   2513                   const VexAbiInfo* vbi, Prefix pfx, Long delta,
   2514                   /*OUT*/HChar* buf, Int extra_bytes )
   2515 {
   2516    UChar mod_reg_rm = getUChar(delta);
   2517    delta++;
   2518 
   2519    buf[0] = (UChar)0;
   2520    vassert(extra_bytes >= 0 && extra_bytes < 10);
   2521 
   2522    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2523       jump table seems a bit excessive.
   2524    */
   2525    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2526    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2527                                                /* is now XX0XXYYY */
   2528    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
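
           /* (Illustrative note, not in the original source: e.g. a ModRM
              byte of 0x6D -- mod=01, reg=101, rm=101 -- masks to 0x45,
              folds to 0x4D and lands in bucket 0x0D, i.e. the "d8(%rbp)"
              group below.) */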
   2529    switch (mod_reg_rm) {
   2530 
   2531       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2532          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2533       */
   2534       case 0x00: case 0x01: case 0x02: case 0x03:
   2535       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2536          { UChar rm = toUChar(mod_reg_rm & 7);
   2537            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2538            *len = 1;
   2539            return disAMode_copy2tmp(
   2540                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
   2541          }
   2542 
   2543       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2544          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2545       */
   2546       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2547       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2548          { UChar rm = toUChar(mod_reg_rm & 7);
   2549            Long d   = getSDisp8(delta);
   2550            if (d == 0) {
   2551               DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2552            } else {
   2553               DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2554            }
   2555            *len = 2;
   2556            return disAMode_copy2tmp(
   2557                   handleAddrOverrides(vbi, pfx,
   2558                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2559          }
   2560 
   2561       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2562          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2563       */
   2564       case 0x10: case 0x11: case 0x12: case 0x13:
   2565       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2566          { UChar rm = toUChar(mod_reg_rm & 7);
   2567            Long  d  = getSDisp32(delta);
   2568            DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2569            *len = 5;
   2570            return disAMode_copy2tmp(
   2571                   handleAddrOverrides(vbi, pfx,
   2572                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2573          }
   2574 
   2575       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2576       /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2577       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2578       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2579          vpanic("disAMode(amd64): not an addr!");
   2580 
   2581       /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
   2582          correctly at the start of handling each instruction. */
   2583       case 0x05:
   2584          { Long d = getSDisp32(delta);
   2585            *len = 5;
   2586            DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
   2587            /* We need to know the next instruction's start address.
   2588               Try and figure out what it is, record the guess, and ask
   2589               the top-level driver logic (bbToIR_AMD64) to check we
   2590               guessed right, after the instruction is completely
   2591               decoded. */
   2592            guest_RIP_next_mustcheck = True;
   2593            guest_RIP_next_assumed = guest_RIP_bbstart
   2594                                     + delta+4 + extra_bytes;
   2595            return disAMode_copy2tmp(
   2596                      handleAddrOverrides(vbi, pfx,
   2597                         binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
   2598                                          mkU64(d))));
   2599          }
   2600 
   2601       case 0x04: {
   2602          /* SIB, with no displacement.  Special cases:
   2603             -- %rsp cannot act as an index value.
   2604                If index_r indicates %rsp, zero is used for the index.
   2605             -- when mod is zero and base indicates RBP or R13, base is
   2606                instead a 32-bit sign-extended literal.
   2607             It's all madness, I tell you.  Extract %index, %base and
   2608             scale from the SIB byte.  The value denoted is then:
   2609                | %index == %RSP && (%base == %RBP || %base == %R13)
   2610                = d32 following SIB byte
   2611                | %index == %RSP && !(%base == %RBP || %base == %R13)
   2612                = %base
   2613                | %index != %RSP && (%base == %RBP || %base == %R13)
   2614                = d32 following SIB byte + (%index << scale)
   2615                | %index != %RSP && !(%base == %RBP || %base == %R13)
   2616                = %base + (%index << scale)
   2617          */
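                 /* (Illustrative note, not in the original source: SIB byte
                    0x9D has scale=2, index=011 (%rbx, with REX.X==0) and
                    base=101, so with mod==00 it denotes
                    d32 + (%rbx << 2).) */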
   2618          UChar sib     = getUChar(delta);
   2619          UChar scale   = toUChar((sib >> 6) & 3);
   2620          UChar index_r = toUChar((sib >> 3) & 7);
   2621          UChar base_r  = toUChar(sib & 7);
   2622          /* correct since #(R13) == 8 + #(RBP) */
   2623          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2624          Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
   2625          delta++;
   2626 
   2627          if ((!index_is_SP) && (!base_is_BPor13)) {
   2628             if (scale == 0) {
   2629                DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2630                          nameIRegRexB(8,pfx,base_r),
   2631                          nameIReg64rexX(pfx,index_r));
   2632             } else {
   2633                DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2634                          nameIRegRexB(8,pfx,base_r),
   2635                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2636             }
   2637             *len = 2;
   2638             return
   2639                disAMode_copy2tmp(
   2640                handleAddrOverrides(vbi, pfx,
   2641                   binop(Iop_Add64,
   2642                         getIRegRexB(8,pfx,base_r),
   2643                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2644                               mkU8(scale)))));
   2645          }
   2646 
   2647          if ((!index_is_SP) && base_is_BPor13) {
   2648             Long d = getSDisp32(delta);
   2649             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
   2650                       nameIReg64rexX(pfx,index_r), 1<<scale);
   2651             *len = 6;
   2652             return
   2653                disAMode_copy2tmp(
   2654                handleAddrOverrides(vbi, pfx,
   2655                   binop(Iop_Add64,
   2656                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2657                                          mkU8(scale)),
   2658                         mkU64(d))));
   2659          }
   2660 
   2661          if (index_is_SP && (!base_is_BPor13)) {
   2662             DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
   2663             *len = 2;
   2664             return disAMode_copy2tmp(
   2665                    handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
   2666          }
   2667 
   2668          if (index_is_SP && base_is_BPor13) {
   2669             Long d = getSDisp32(delta);
   2670             DIS(buf, "%s%lld", segRegTxt(pfx), d);
   2671             *len = 6;
   2672             return disAMode_copy2tmp(
   2673                    handleAddrOverrides(vbi, pfx, mkU64(d)));
   2674          }
   2675 
   2676          vassert(0);
   2677       }
   2678 
   2679       /* SIB, with 8-bit displacement.  Special cases:
   2680          -- %rsp cannot act as an index value.
   2681             If index_r indicates %rsp, zero is used for the index.
   2682          Denoted value is:
   2683             | %index == %RSP
   2684             = d8 + %base
   2685             | %index != %RSP
   2686             = d8 + %base + (%index << scale)
   2687       */
   2688       case 0x0C: {
   2689          UChar sib     = getUChar(delta);
   2690          UChar scale   = toUChar((sib >> 6) & 3);
   2691          UChar index_r = toUChar((sib >> 3) & 7);
   2692          UChar base_r  = toUChar(sib & 7);
   2693          Long d        = getSDisp8(delta+1);
   2694 
   2695          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2696             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2697                                    d, nameIRegRexB(8,pfx,base_r));
   2698             *len = 3;
   2699             return disAMode_copy2tmp(
   2700                    handleAddrOverrides(vbi, pfx,
   2701                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2702          } else {
   2703             if (scale == 0) {
   2704                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2705                          nameIRegRexB(8,pfx,base_r),
   2706                          nameIReg64rexX(pfx,index_r));
   2707             } else {
   2708                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2709                          nameIRegRexB(8,pfx,base_r),
   2710                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2711             }
   2712             *len = 3;
   2713             return
   2714                 disAMode_copy2tmp(
   2715                 handleAddrOverrides(vbi, pfx,
   2716                   binop(Iop_Add64,
   2717                         binop(Iop_Add64,
   2718                               getIRegRexB(8,pfx,base_r),
   2719                               binop(Iop_Shl64,
   2720                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2721                         mkU64(d))));
   2722          }
   2723          vassert(0); /*NOTREACHED*/
   2724       }
   2725 
   2726       /* SIB, with 32-bit displacement.  Special cases:
   2727          -- %rsp cannot act as an index value.
   2728             If index_r indicates %rsp, zero is used for the index.
   2729          Denoted value is:
   2730             | %index == %RSP
   2731             = d32 + %base
   2732             | %index != %RSP
   2733             = d32 + %base + (%index << scale)
   2734       */
   2735       case 0x14: {
   2736          UChar sib     = getUChar(delta);
   2737          UChar scale   = toUChar((sib >> 6) & 3);
   2738          UChar index_r = toUChar((sib >> 3) & 7);
   2739          UChar base_r  = toUChar(sib & 7);
   2740          Long d        = getSDisp32(delta+1);
   2741 
   2742          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2743             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2744                                    d, nameIRegRexB(8,pfx,base_r));
   2745             *len = 6;
   2746             return disAMode_copy2tmp(
   2747                    handleAddrOverrides(vbi, pfx,
   2748                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2749          } else {
   2750             if (scale == 0) {
   2751                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2752                          nameIRegRexB(8,pfx,base_r),
   2753                          nameIReg64rexX(pfx,index_r));
   2754             } else {
   2755                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2756                          nameIRegRexB(8,pfx,base_r),
   2757                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2758             }
   2759             *len = 6;
   2760             return
   2761                 disAMode_copy2tmp(
   2762                 handleAddrOverrides(vbi, pfx,
   2763                   binop(Iop_Add64,
   2764                         binop(Iop_Add64,
   2765                               getIRegRexB(8,pfx,base_r),
   2766                               binop(Iop_Shl64,
   2767                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2768                         mkU64(d))));
   2769          }
   2770          vassert(0); /*NOTREACHED*/
   2771       }
   2772 
   2773       default:
   2774          vpanic("disAMode(amd64)");
   2775          return 0; /*notreached*/
   2776    }
   2777 }
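
        /* Illustrative sketch (not part of the original source): the usual
           caller pattern for disAMode.  'ty', 'vbi', 'pfx' and 'delta'
           stand in for decoder state; extra_bytes is 0 here because no
           immediate follows the amode. */
        #if 0
           HChar  dis_buf[50];
           Int    alen;
           IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
           IRTemp t    = newTemp(ty);
           delta += alen;                          /* step past the amode */
           assign( t, loadLE(ty, mkexpr(addr)) );  /* use the computed EA */
        #endif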
   2778 
   2779 
   2780 /* Similarly for VSIB addressing.  This returns just the addend,
   2781    and fills in *rI with the register number of the vector index and
   2782    *vscale with the scale factor it is multiplied by.  */
   2783 static
   2784 IRTemp disAVSIBMode ( /*OUT*/Int* len,
   2785                       const VexAbiInfo* vbi, Prefix pfx, Long delta,
   2786                       /*OUT*/HChar* buf, /*OUT*/UInt* rI,
   2787                       IRType ty, /*OUT*/Int* vscale )
   2788 {
   2789    UChar mod_reg_rm = getUChar(delta);
   2790    const HChar *vindex;
   2791 
   2792    *len = 0;
   2793    *rI = 0;
   2794    *vscale = 0;
   2795    buf[0] = (UChar)0;
   2796    if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
   2797       return IRTemp_INVALID;
   2798 
   2799    UChar sib     = getUChar(delta+1);
   2800    UChar scale   = toUChar((sib >> 6) & 3);
   2801    UChar index_r = toUChar((sib >> 3) & 7);
   2802    UChar base_r  = toUChar(sib & 7);
   2803    Long  d       = 0;
    2804    /* also catches %r13, since #(R13) == 8 + #(RBP) */
   2805    Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2806    delta += 2;
   2807    *len = 2;
   2808 
   2809    *rI = index_r | (getRexX(pfx) << 3);
   2810    if (ty == Ity_V128)
   2811       vindex = nameXMMReg(*rI);
   2812    else
   2813       vindex = nameYMMReg(*rI);
   2814    *vscale = 1<<scale;
   2815 
   2816    switch (mod_reg_rm >> 6) {
   2817    case 0:
   2818       if (base_is_BPor13) {
   2819          d = getSDisp32(delta);
   2820          *len += 4;
   2821          if (scale == 0) {
   2822             DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
   2823          } else {
   2824             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
   2825          }
   2826          return disAMode_copy2tmp( mkU64(d) );
   2827       } else {
   2828          if (scale == 0) {
   2829             DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2830                      nameIRegRexB(8,pfx,base_r), vindex);
   2831          } else {
   2832             DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2833                      nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
   2834          }
   2835       }
   2836       break;
   2837    case 1:
   2838       d = getSDisp8(delta);
   2839       *len += 1;
   2840       goto have_disp;
   2841    case 2:
   2842       d = getSDisp32(delta);
   2843       *len += 4;
   2844    have_disp:
   2845       if (scale == 0) {
   2846          DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2847                   nameIRegRexB(8,pfx,base_r), vindex);
   2848       } else {
   2849          DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2850                   nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
   2851       }
   2852       break;
   2853    }
   2854 
   2855    if (!d)
   2856       return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   2857    return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
   2858                                    mkU64(d)) );
   2859 }
   2860 
   2861 
   2862 /* Figure out the number of (insn-stream) bytes constituting the amode
   2863    beginning at delta.  Is useful for getting hold of literals beyond
   2864    the end of the amode before it has been disassembled.  */
   2865 
   2866 static UInt lengthAMode ( Prefix pfx, Long delta )
   2867 {
   2868    UChar mod_reg_rm = getUChar(delta);
   2869    delta++;
   2870 
   2871    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2872       jump table seems a bit excessive.
   2873    */
   2874    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2875    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2876                                                /* is now XX0XXYYY */
   2877    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
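            /* Worked example (a sketch): 0x94 (mod=10, reg=010, rm=100)
               -> & 0xC7 = 0x84 -> | (0x84 >> 3) = 0x94 -> & 0x1F = 0x14,
               which lands in the SIB + disp32 case below. */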
   2878    switch (mod_reg_rm) {
   2879 
   2880       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2881          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2882       */
   2883       case 0x00: case 0x01: case 0x02: case 0x03:
   2884       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2885          return 1;
   2886 
   2887       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2888          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2889       */
   2890       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2891       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2892          return 2;
   2893 
   2894       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2895          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2896       */
   2897       case 0x10: case 0x11: case 0x12: case 0x13:
   2898       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2899          return 5;
   2900 
   2901       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
    2902       /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2903       /* Not an address, but still handled. */
   2904       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2905       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2906          return 1;
   2907 
   2908       /* RIP + disp32. */
   2909       case 0x05:
   2910          return 5;
   2911 
   2912       case 0x04: {
   2913          /* SIB, with no displacement. */
   2914          UChar sib     = getUChar(delta);
   2915          UChar base_r  = toUChar(sib & 7);
    2916          /* also catches %r13, since #(R13) == 8 + #(RBP) */
   2917          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2918 
   2919          if (base_is_BPor13) {
   2920             return 6;
   2921          } else {
   2922             return 2;
   2923          }
   2924       }
   2925 
   2926       /* SIB, with 8-bit displacement. */
   2927       case 0x0C:
   2928          return 3;
   2929 
   2930       /* SIB, with 32-bit displacement. */
   2931       case 0x14:
   2932          return 6;
   2933 
   2934       default:
   2935          vpanic("lengthAMode(amd64)");
   2936          return 0; /*notreached*/
   2937    }
   2938 }
   2939 
   2940 
   2941 /*------------------------------------------------------------*/
   2942 /*--- Disassembling common idioms                          ---*/
   2943 /*------------------------------------------------------------*/
   2944 
   2945 typedef
   2946   enum { WithFlagNone=2, WithFlagCarry, WithFlagCarryX, WithFlagOverX }
   2947   WithFlag;
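
         /* (Orientation only; the call sites below are authoritative.
            WithFlagNone covers plain ADD/SUB/AND/OR/XOR, and CMP, which is
            SUB with keep==False; WithFlagCarry covers ADC/SBB; and
            WithFlagCarryX/WithFlagOverX cover ADCX/ADOX respectively.) */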
   2948 
   2949 /* Handle binary integer instructions of the form
   2950       op E, G  meaning
   2951       op reg-or-mem, reg
    2952    Is passed a pointer to the modRM byte, the actual operation, and the
   2953    data size.  Returns the address advanced completely over this
   2954    instruction.
   2955 
   2956    E(src) is reg-or-mem
   2957    G(dst) is reg.
   2958 
   2959    If E is reg, -->    GET %G,  tmp
   2960                        OP %E,   tmp
   2961                        PUT tmp, %G
   2962 
   2963    If E is mem and OP is not reversible,
   2964                 -->    (getAddr E) -> tmpa
   2965                        LD (tmpa), tmpa
   2966                        GET %G, tmp2
   2967                        OP tmpa, tmp2
   2968                        PUT tmp2, %G
   2969 
   2970    If E is mem and OP is reversible
   2971                 -->    (getAddr E) -> tmpa
   2972                        LD (tmpa), tmpa
   2973                        OP %G, tmpa
   2974                        PUT tmpa, %G
   2975 */
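         /* For instance (illustrative): "add (%rcx),%rbx" takes the memory
            path below: disAMode computes tmpa = %rcx, the 8-byte load gives
            src, dst0 is %rbx, dst1 = dst0 + src feeds the flag thunk via
            setFlags_DEP1_DEP2, and dst1 is PUT back in %rbx. */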
   2976 static
   2977 ULong dis_op2_E_G ( const VexAbiInfo* vbi,
   2978                     Prefix      pfx,
   2979                     IROp        op8,
   2980                     WithFlag    flag,
   2981                     Bool        keep,
   2982                     Int         size,
   2983                     Long        delta0,
   2984                     const HChar* t_amd64opc )
   2985 {
   2986    HChar   dis_buf[50];
   2987    Int     len;
   2988    IRType  ty   = szToITy(size);
   2989    IRTemp  dst1 = newTemp(ty);
   2990    IRTemp  src  = newTemp(ty);
   2991    IRTemp  dst0 = newTemp(ty);
   2992    UChar   rm   = getUChar(delta0);
   2993    IRTemp  addr = IRTemp_INVALID;
   2994 
   2995    /* Stay sane -- check for valid (op8, flag, keep) combinations. */
   2996    switch (op8) {
   2997       case Iop_Add8:
   2998          switch (flag) {
   2999             case WithFlagNone: case WithFlagCarry:
   3000             case WithFlagCarryX: case WithFlagOverX:
   3001                vassert(keep);
   3002                break;
   3003             default:
   3004                vassert(0);
   3005          }
   3006          break;
   3007       case Iop_Sub8:
   3008          vassert(flag == WithFlagNone || flag == WithFlagCarry);
   3009          if (flag == WithFlagCarry) vassert(keep);
   3010          break;
   3011       case Iop_And8:
   3012          vassert(flag == WithFlagNone);
   3013          break;
   3014       case Iop_Or8: case Iop_Xor8:
   3015          vassert(flag == WithFlagNone);
   3016          vassert(keep);
   3017          break;
   3018       default:
   3019          vassert(0);
   3020    }
   3021 
   3022    if (epartIsReg(rm)) {
   3023       /* Specially handle XOR reg,reg, because that doesn't really
   3024          depend on reg, and doing the obvious thing potentially
   3025          generates a spurious value check failure due to the bogus
   3026          dependency.  Ditto SUB/SBB reg,reg. */
   3027       if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
   3028           && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
   3029          putIRegG(size,pfx,rm, mkU(ty,0));
   3030       }
   3031 
   3032       assign( dst0, getIRegG(size,pfx,rm) );
   3033       assign( src,  getIRegE(size,pfx,rm) );
   3034 
   3035       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3036          helper_ADC( size, dst1, dst0, src,
   3037                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3038          putIRegG(size, pfx, rm, mkexpr(dst1));
   3039       } else
   3040       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3041          helper_SBB( size, dst1, dst0, src,
   3042                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3043          putIRegG(size, pfx, rm, mkexpr(dst1));
   3044       } else
   3045       if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
   3046          helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
   3047          putIRegG(size, pfx, rm, mkexpr(dst1));
   3048       } else
   3049       if (op8 == Iop_Add8 && flag == WithFlagOverX) {
   3050          helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
   3051          putIRegG(size, pfx, rm, mkexpr(dst1));
   3052       } else {
   3053          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3054          if (isAddSub(op8))
   3055             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3056          else
   3057             setFlags_DEP1(op8, dst1, ty);
   3058          if (keep)
   3059             putIRegG(size, pfx, rm, mkexpr(dst1));
   3060       }
   3061 
   3062       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3063                           nameIRegE(size,pfx,rm),
   3064                           nameIRegG(size,pfx,rm));
   3065       return 1+delta0;
   3066    } else {
   3067       /* E refers to memory */
   3068       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3069       assign( dst0, getIRegG(size,pfx,rm) );
   3070       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   3071 
   3072       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3073          helper_ADC( size, dst1, dst0, src,
   3074                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3075          putIRegG(size, pfx, rm, mkexpr(dst1));
   3076       } else
   3077       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3078          helper_SBB( size, dst1, dst0, src,
   3079                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3080          putIRegG(size, pfx, rm, mkexpr(dst1));
   3081       } else
    3082       if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
    3083          /* result is written to the G register; E (memory) is only read */
    3084          helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
                  putIRegG(size, pfx, rm, mkexpr(dst1));
    3085       } else
    3086       if (op8 == Iop_Add8 && flag == WithFlagOverX) {
    3087          /* result is written to the G register; E (memory) is only read */
    3088          helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
                  putIRegG(size, pfx, rm, mkexpr(dst1));
   3089       } else {
   3090          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3091          if (isAddSub(op8))
   3092             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3093          else
   3094             setFlags_DEP1(op8, dst1, ty);
   3095          if (keep)
   3096             putIRegG(size, pfx, rm, mkexpr(dst1));
   3097       }
   3098 
   3099       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3100                           dis_buf, nameIRegG(size, pfx, rm));
   3101       return len+delta0;
   3102    }
   3103 }
   3104 
   3105 
   3106 
   3107 /* Handle binary integer instructions of the form
   3108       op G, E  meaning
   3109       op reg, reg-or-mem
    3110    Is passed a pointer to the modRM byte, the actual operation, and the
   3111    data size.  Returns the address advanced completely over this
   3112    instruction.
   3113 
   3114    G(src) is reg.
   3115    E(dst) is reg-or-mem
   3116 
   3117    If E is reg, -->    GET %E,  tmp
   3118                        OP %G,   tmp
   3119                        PUT tmp, %E
   3120 
   3121    If E is mem, -->    (getAddr E) -> tmpa
   3122                        LD (tmpa), tmpv
   3123                        OP %G, tmpv
   3124                        ST tmpv, (tmpa)
   3125 */
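         /* For instance (illustrative): "add %rbx,(%rcx)" takes the memory
            path below: dst0 is loaded from (%rcx), src is %rbx, and dst1 is
            written back with storeLE -- or with casLE when a LOCK prefix is
            present, so the read-modify-write appears atomic. */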
   3126 static
   3127 ULong dis_op2_G_E ( const VexAbiInfo* vbi,
   3128                     Prefix      pfx,
   3129                     IROp        op8,
   3130                     WithFlag    flag,
   3131                     Bool        keep,
   3132                     Int         size,
   3133                     Long        delta0,
   3134                     const HChar* t_amd64opc )
   3135 {
   3136    HChar   dis_buf[50];
   3137    Int     len;
   3138    IRType  ty   = szToITy(size);
   3139    IRTemp  dst1 = newTemp(ty);
   3140    IRTemp  src  = newTemp(ty);
   3141    IRTemp  dst0 = newTemp(ty);
   3142    UChar   rm   = getUChar(delta0);
   3143    IRTemp  addr = IRTemp_INVALID;
   3144 
   3145    /* Stay sane -- check for valid (op8, flag, keep) combinations. */
   3146    switch (op8) {
   3147       case Iop_Add8:
   3148          vassert(flag == WithFlagNone || flag == WithFlagCarry);
   3149          vassert(keep);
   3150          break;
   3151       case Iop_Sub8:
   3152          vassert(flag == WithFlagNone || flag == WithFlagCarry);
   3153          if (flag == WithFlagCarry) vassert(keep);
   3154          break;
   3155       case Iop_And8: case Iop_Or8: case Iop_Xor8:
   3156          vassert(flag == WithFlagNone);
   3157          vassert(keep);
   3158          break;
   3159       default:
   3160          vassert(0);
   3161    }
   3162 
   3163    /* flag != WithFlagNone is only allowed for Add and Sub and indicates the
   3164       intended operation is add-with-carry or subtract-with-borrow. */
   3165 
   3166    if (epartIsReg(rm)) {
   3167       /* Specially handle XOR reg,reg, because that doesn't really
   3168          depend on reg, and doing the obvious thing potentially
   3169          generates a spurious value check failure due to the bogus
   3170          dependency.  Ditto SUB/SBB reg,reg. */
   3171       if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
   3172           && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
   3173          putIRegE(size,pfx,rm, mkU(ty,0));
   3174       }
   3175 
   3176       assign(dst0, getIRegE(size,pfx,rm));
   3177       assign(src,  getIRegG(size,pfx,rm));
   3178 
   3179       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3180          helper_ADC( size, dst1, dst0, src,
   3181                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3182          putIRegE(size, pfx, rm, mkexpr(dst1));
   3183       } else
   3184       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3185          helper_SBB( size, dst1, dst0, src,
   3186                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3187          putIRegE(size, pfx, rm, mkexpr(dst1));
   3188       } else {
   3189          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3190          if (isAddSub(op8))
   3191             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3192          else
   3193             setFlags_DEP1(op8, dst1, ty);
   3194          if (keep)
   3195             putIRegE(size, pfx, rm, mkexpr(dst1));
   3196       }
   3197 
   3198       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3199                           nameIRegG(size,pfx,rm),
   3200                           nameIRegE(size,pfx,rm));
   3201       return 1+delta0;
   3202    }
   3203 
   3204    /* E refers to memory */
   3205    {
   3206       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3207       assign(dst0, loadLE(ty,mkexpr(addr)));
   3208       assign(src,  getIRegG(size,pfx,rm));
   3209 
   3210       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3211          if (haveLOCK(pfx)) {
   3212             /* cas-style store */
   3213             helper_ADC( size, dst1, dst0, src,
   3214                         /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3215          } else {
   3216             /* normal store */
   3217             helper_ADC( size, dst1, dst0, src,
   3218                         /*store*/addr, IRTemp_INVALID, 0 );
   3219          }
   3220       } else
   3221       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3222          if (haveLOCK(pfx)) {
   3223             /* cas-style store */
   3224             helper_SBB( size, dst1, dst0, src,
   3225                         /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3226          } else {
   3227             /* normal store */
   3228             helper_SBB( size, dst1, dst0, src,
   3229                         /*store*/addr, IRTemp_INVALID, 0 );
   3230          }
   3231       } else {
   3232          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3233          if (keep) {
   3234             if (haveLOCK(pfx)) {
   3235                if (0) vex_printf("locked case\n" );
   3236                casLE( mkexpr(addr),
   3237                       mkexpr(dst0)/*expval*/,
   3238                       mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
   3239             } else {
   3240                if (0) vex_printf("nonlocked case\n");
   3241                storeLE(mkexpr(addr), mkexpr(dst1));
   3242             }
   3243          }
   3244          if (isAddSub(op8))
   3245             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3246          else
   3247             setFlags_DEP1(op8, dst1, ty);
   3248       }
   3249 
   3250       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3251                           nameIRegG(size,pfx,rm), dis_buf);
   3252       return len+delta0;
   3253    }
   3254 }
   3255 
   3256 
   3257 /* Handle move instructions of the form
   3258       mov E, G  meaning
   3259       mov reg-or-mem, reg
    3260    Is passed a pointer to the modRM byte, and the data size.  Returns
   3261    the address advanced completely over this instruction.
   3262 
   3263    E(src) is reg-or-mem
   3264    G(dst) is reg.
   3265 
   3266    If E is reg, -->    GET %E,  tmpv
   3267                        PUT tmpv, %G
   3268 
   3269    If E is mem  -->    (getAddr E) -> tmpa
   3270                        LD (tmpa), tmpb
   3271                        PUT tmpb, %G
   3272 */
   3273 static
   3274 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
   3275                     Prefix      pfx,
   3276                     Int         size,
   3277                     Long        delta0 )
   3278 {
   3279    Int len;
   3280    UChar rm = getUChar(delta0);
   3281    HChar dis_buf[50];
   3282 
   3283    if (epartIsReg(rm)) {
   3284       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3285       DIP("mov%c %s,%s\n", nameISize(size),
   3286                            nameIRegE(size,pfx,rm),
   3287                            nameIRegG(size,pfx,rm));
   3288       return 1+delta0;
   3289    }
   3290 
   3291    /* E refers to memory */
   3292    {
   3293       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3294       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3295       DIP("mov%c %s,%s\n", nameISize(size),
   3296                            dis_buf,
   3297                            nameIRegG(size,pfx,rm));
   3298       return delta0+len;
   3299    }
   3300 }
   3301 
   3302 
   3303 /* Handle move instructions of the form
   3304       mov G, E  meaning
   3305       mov reg, reg-or-mem
    3306    Is passed a pointer to the modRM byte, and the data size.  Returns
   3307    the address advanced completely over this instruction.
   3308    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3309 
   3310    G(src) is reg.
   3311    E(dst) is reg-or-mem
   3312 
   3313    If E is reg, -->    GET %G,  tmp
   3314                        PUT tmp, %E
   3315 
   3316    If E is mem, -->    (getAddr E) -> tmpa
   3317                        GET %G, tmpv
   3318                        ST tmpv, (tmpa)
   3319 */
   3320 static
   3321 ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
   3322                     Prefix       pfx,
   3323                     Int          size,
   3324                     Long         delta0,
   3325                     /*OUT*/Bool* ok )
   3326 {
   3327    Int   len;
   3328    UChar rm = getUChar(delta0);
   3329    HChar dis_buf[50];
   3330 
   3331    *ok = True;
   3332 
   3333    if (epartIsReg(rm)) {
   3334       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3335       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3336       DIP("mov%c %s,%s\n", nameISize(size),
   3337                            nameIRegG(size,pfx,rm),
   3338                            nameIRegE(size,pfx,rm));
   3339       return 1+delta0;
   3340    }
   3341 
   3342    /* E refers to memory */
   3343    {
   3344       if (haveF2(pfx)) { *ok = False; return delta0; }
   3345       /* F3(XRELEASE) is acceptable, though. */
   3346       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3347       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3348       DIP("mov%c %s,%s\n", nameISize(size),
   3349                            nameIRegG(size,pfx,rm),
   3350                            dis_buf);
   3351       return len+delta0;
   3352    }
   3353 }
   3354 
   3355 
   3356 /* op $immediate, AL/AX/EAX/RAX. */
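         /* For instance (illustrative): "addq $-1,%rax" carries only a
            4-byte immediate (size4 = imin(8,4) = 4); getSDisp sign-extends
            it to 0xFFFFFFFFFFFFFFFF before the 64-bit add, matching the
            usual imm32 sign-extension rule for 64-bit ops. */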
   3357 static
   3358 ULong dis_op_imm_A ( Int    size,
   3359                      Bool   carrying,
   3360                      IROp   op8,
   3361                      Bool   keep,
   3362                      Long   delta,
   3363                      const HChar* t_amd64opc )
   3364 {
   3365    Int    size4 = imin(size,4);
   3366    IRType ty    = szToITy(size);
   3367    IRTemp dst0  = newTemp(ty);
   3368    IRTemp src   = newTemp(ty);
   3369    IRTemp dst1  = newTemp(ty);
   3370    Long  lit    = getSDisp(size4,delta);
   3371    assign(dst0, getIRegRAX(size));
   3372    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3373 
   3374    if (isAddSub(op8) && !carrying) {
   3375       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3376       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3377    }
   3378    else
   3379    if (isLogic(op8)) {
   3380       vassert(!carrying);
   3381       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3382       setFlags_DEP1(op8, dst1, ty);
   3383    }
   3384    else
   3385    if (op8 == Iop_Add8 && carrying) {
   3386       helper_ADC( size, dst1, dst0, src,
   3387                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3388    }
   3389    else
   3390    if (op8 == Iop_Sub8 && carrying) {
   3391       helper_SBB( size, dst1, dst0, src,
   3392                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3393    }
   3394    else
   3395       vpanic("dis_op_imm_A(amd64,guest)");
   3396 
   3397    if (keep)
   3398       putIRegRAX(size, mkexpr(dst1));
   3399 
   3400    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3401                            lit, nameIRegRAX(size));
   3402    return delta+size4;
   3403 }
   3404 
   3405 
   3406 /* Sign- and Zero-extending moves. */
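         /* For instance (illustrative): "movzbl (%rdi),%eax" arrives here
            with szs=1, szd=4, sign_extend=False, while "movsbq %cl,%rax"
            arrives with szs=1, szd=8, sign_extend=True. */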
   3407 static
   3408 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
   3409                      Prefix pfx,
   3410                      Long delta, Int szs, Int szd, Bool sign_extend )
   3411 {
   3412    UChar rm = getUChar(delta);
   3413    if (epartIsReg(rm)) {
   3414       putIRegG(szd, pfx, rm,
   3415                     doScalarWidening(
   3416                        szs,szd,sign_extend,
   3417                        getIRegE(szs,pfx,rm)));
   3418       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3419                                nameISize(szs),
   3420                                nameISize(szd),
   3421                                nameIRegE(szs,pfx,rm),
   3422                                nameIRegG(szd,pfx,rm));
   3423       return 1+delta;
   3424    }
   3425 
   3426    /* E refers to memory */
   3427    {
   3428       Int    len;
   3429       HChar  dis_buf[50];
   3430       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3431       putIRegG(szd, pfx, rm,
   3432                     doScalarWidening(
   3433                        szs,szd,sign_extend,
   3434                        loadLE(szToITy(szs),mkexpr(addr))));
   3435       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3436                                nameISize(szs),
   3437                                nameISize(szd),
   3438                                dis_buf,
   3439                                nameIRegG(szd,pfx,rm));
   3440       return len+delta;
   3441    }
   3442 }
   3443 
   3444 
   3445 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3446    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
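         /* Note (restating the code below): for sz==1 the dividend is all
            of AX, not DX:AX; AX is widened to 64 bits, divided, and the
            quotient lands in AL with the remainder in AH, matching the
            hardware's 8-bit DIV/IDIV convention. */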
   3447 static
   3448 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   3449 {
   3450    /* special-case the 64-bit case */
   3451    if (sz == 8) {
   3452       IROp   op     = signed_divide ? Iop_DivModS128to64
   3453                                     : Iop_DivModU128to64;
   3454       IRTemp src128 = newTemp(Ity_I128);
   3455       IRTemp dst128 = newTemp(Ity_I128);
   3456       assign( src128, binop(Iop_64HLto128,
   3457                             getIReg64(R_RDX),
   3458                             getIReg64(R_RAX)) );
   3459       assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
   3460       putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
   3461       putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   3462    } else {
   3463       IROp   op    = signed_divide ? Iop_DivModS64to32
   3464                                    : Iop_DivModU64to32;
   3465       IRTemp src64 = newTemp(Ity_I64);
   3466       IRTemp dst64 = newTemp(Ity_I64);
   3467       switch (sz) {
   3468       case 4:
   3469          assign( src64,
   3470                  binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
   3471          assign( dst64,
   3472                  binop(op, mkexpr(src64), mkexpr(t)) );
   3473          putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
   3474          putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
   3475          break;
   3476       case 2: {
   3477          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   3478          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   3479          assign( src64, unop(widen3264,
   3480                              binop(Iop_16HLto32,
   3481                                    getIRegRDX(2),
   3482                                    getIRegRAX(2))) );
   3483          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   3484          putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   3485          putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   3486          break;
   3487       }
   3488       case 1: {
   3489          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   3490          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   3491          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   3492          assign( src64, unop(widen3264,
   3493                         unop(widen1632, getIRegRAX(2))) );
   3494          assign( dst64,
   3495                  binop(op, mkexpr(src64),
   3496                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   3497          putIRegRAX( 1, unop(Iop_16to8,
   3498                         unop(Iop_32to16,
   3499                         unop(Iop_64to32,mkexpr(dst64)))) );
   3500          putIRegAH( unop(Iop_16to8,
   3501                     unop(Iop_32to16,
   3502                     unop(Iop_64HIto32,mkexpr(dst64)))) );
   3503          break;
   3504       }
   3505       default:
   3506          vpanic("codegen_div(amd64)");
   3507       }
   3508    }
   3509 }
   3510 
   3511 static
   3512 ULong dis_Grp1 ( const VexAbiInfo* vbi,
   3513                  Prefix pfx,
   3514                  Long delta, UChar modrm,
   3515                  Int am_sz, Int d_sz, Int sz, Long d64 )
   3516 {
   3517    Int     len;
   3518    HChar   dis_buf[50];
   3519    IRType  ty   = szToITy(sz);
   3520    IRTemp  dst1 = newTemp(ty);
   3521    IRTemp  src  = newTemp(ty);
   3522    IRTemp  dst0 = newTemp(ty);
   3523    IRTemp  addr = IRTemp_INVALID;
   3524    IROp    op8  = Iop_INVALID;
   3525    ULong   mask = mkSizeMask(sz);
   3526 
   3527    switch (gregLO3ofRM(modrm)) {
   3528       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   3529       case 2: break;  // ADC
   3530       case 3: break;  // SBB
   3531       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   3532       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   3533       /*NOTREACHED*/
   3534       default: vpanic("dis_Grp1(amd64): unhandled case");
   3535    }
   3536 
   3537    if (epartIsReg(modrm)) {
   3538       vassert(am_sz == 1);
   3539 
   3540       assign(dst0, getIRegE(sz,pfx,modrm));
   3541       assign(src,  mkU(ty,d64 & mask));
   3542 
   3543       if (gregLO3ofRM(modrm) == 2 /* ADC */) {
   3544          helper_ADC( sz, dst1, dst0, src,
   3545                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3546       } else
   3547       if (gregLO3ofRM(modrm) == 3 /* SBB */) {
   3548          helper_SBB( sz, dst1, dst0, src,
   3549                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3550       } else {
   3551          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3552          if (isAddSub(op8))
   3553             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3554          else
   3555             setFlags_DEP1(op8, dst1, ty);
   3556       }
   3557 
   3558       if (gregLO3ofRM(modrm) < 7)
   3559          putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3560 
   3561       delta += (am_sz + d_sz);
   3562       DIP("%s%c $%lld, %s\n",
   3563           nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
   3564           nameIRegE(sz,pfx,modrm));
   3565    } else {
   3566       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
   3567 
   3568       assign(dst0, loadLE(ty,mkexpr(addr)));
   3569       assign(src, mkU(ty,d64 & mask));
   3570 
   3571       if (gregLO3ofRM(modrm) == 2 /* ADC */) {
   3572          if (haveLOCK(pfx)) {
   3573             /* cas-style store */
   3574             helper_ADC( sz, dst1, dst0, src,
   3575                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3576          } else {
   3577             /* normal store */
   3578             helper_ADC( sz, dst1, dst0, src,
   3579                         /*store*/addr, IRTemp_INVALID, 0 );
   3580          }
   3581       } else
   3582       if (gregLO3ofRM(modrm) == 3 /* SBB */) {
   3583          if (haveLOCK(pfx)) {
   3584             /* cas-style store */
   3585             helper_SBB( sz, dst1, dst0, src,
   3586                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3587          } else {
   3588             /* normal store */
   3589             helper_SBB( sz, dst1, dst0, src,
   3590                         /*store*/addr, IRTemp_INVALID, 0 );
   3591          }
   3592       } else {
   3593          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3594          if (gregLO3ofRM(modrm) < 7) {
   3595             if (haveLOCK(pfx)) {
   3596                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   3597                                     mkexpr(dst1)/*newVal*/,
   3598                                     guest_RIP_curr_instr );
   3599             } else {
   3600                storeLE(mkexpr(addr), mkexpr(dst1));
   3601             }
   3602          }
   3603          if (isAddSub(op8))
   3604             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3605          else
   3606             setFlags_DEP1(op8, dst1, ty);
   3607       }
   3608 
   3609       delta += (len+d_sz);
   3610       DIP("%s%c $%lld, %s\n",
   3611           nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
   3612           d64, dis_buf);
   3613    }
   3614    return delta;
   3615 }
   3616 
   3617 
   3618 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3619    expression. */
   3620 
   3621 static
   3622 ULong dis_Grp2 ( const VexAbiInfo* vbi,
   3623                  Prefix pfx,
   3624                  Long delta, UChar modrm,
   3625                  Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   3626                  const HChar* shift_expr_txt, Bool* decode_OK )
   3627 {
   3628    /* delta on entry points at the modrm byte. */
   3629    HChar  dis_buf[50];
   3630    Int    len;
   3631    Bool   isShift, isRotate, isRotateC;
   3632    IRType ty    = szToITy(sz);
   3633    IRTemp dst0  = newTemp(ty);
   3634    IRTemp dst1  = newTemp(ty);
   3635    IRTemp addr  = IRTemp_INVALID;
   3636 
   3637    *decode_OK = True;
   3638 
   3639    vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
   3640 
   3641    /* Put value to shift/rotate in dst0. */
   3642    if (epartIsReg(modrm)) {
   3643       assign(dst0, getIRegE(sz, pfx, modrm));
   3644       delta += (am_sz + d_sz);
   3645    } else {
   3646       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
   3647       assign(dst0, loadLE(ty,mkexpr(addr)));
   3648       delta += len + d_sz;
   3649    }
   3650 
   3651    isShift = False;
   3652    switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   3653 
   3654    isRotate = False;
   3655    switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }
   3656 
   3657    isRotateC = False;
   3658    switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }
   3659 
   3660    if (!isShift && !isRotate && !isRotateC) {
   3661       /*NOTREACHED*/
   3662       vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   3663    }
   3664 
   3665    if (isRotateC) {
   3666       /* Call a helper; this insn is so ridiculous it does not deserve
   3667          better.  One problem is, the helper has to calculate both the
   3668          new value and the new flags.  This is more than 64 bits, and
   3669          there is no way to return more than 64 bits from the helper.
   3670          Hence the crude and obvious solution is to call it twice,
   3671          using the sign of the sz field to indicate whether it is the
   3672          value or rflags result we want.
   3673       */
   3674       Bool     left = toBool(gregLO3ofRM(modrm) == 2);
   3675       IRExpr** argsVALUE;
   3676       IRExpr** argsRFLAGS;
   3677 
   3678       IRTemp new_value  = newTemp(Ity_I64);
   3679       IRTemp new_rflags = newTemp(Ity_I64);
   3680       IRTemp old_rflags = newTemp(Ity_I64);
   3681 
   3682       assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );
   3683 
   3684       argsVALUE
   3685          = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
   3686                           widenUto64(shift_expr),   /* rotate amount */
   3687                           mkexpr(old_rflags),
   3688                           mkU64(sz) );
   3689       assign( new_value,
   3690                  mkIRExprCCall(
   3691                     Ity_I64,
   3692                     0/*regparm*/,
   3693                     left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
   3694                     left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
   3695                     argsVALUE
   3696                  )
   3697             );
   3698 
   3699       argsRFLAGS
   3700          = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
   3701                           widenUto64(shift_expr),   /* rotate amount */
   3702                           mkexpr(old_rflags),
   3703                           mkU64(-sz) );
   3704       assign( new_rflags,
   3705                  mkIRExprCCall(
   3706                     Ity_I64,
   3707                     0/*regparm*/,
   3708                     left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
   3709                     left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
   3710                     argsRFLAGS
   3711                  )
   3712             );
   3713 
   3714       assign( dst1, narrowTo(ty, mkexpr(new_value)) );
   3715       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   3716       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
   3717       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   3718       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   3719    }
   3720 
   3721    else
   3722    if (isShift) {
   3723 
   3724       IRTemp pre64     = newTemp(Ity_I64);
   3725       IRTemp res64     = newTemp(Ity_I64);
   3726       IRTemp res64ss   = newTemp(Ity_I64);
   3727       IRTemp shift_amt = newTemp(Ity_I8);
   3728       UChar  mask      = toUChar(sz==8 ? 63 : 31);
   3729       IROp   op64;
   3730 
   3731       switch (gregLO3ofRM(modrm)) {
   3732          case 4: op64 = Iop_Shl64; break;
   3733          case 5: op64 = Iop_Shr64; break;
   3734          case 6: op64 = Iop_Shl64; break;
   3735          case 7: op64 = Iop_Sar64; break;
   3736          /*NOTREACHED*/
   3737          default: vpanic("dis_Grp2:shift"); break;
   3738       }
   3739 
   3740       /* Widen the value to be shifted to 64 bits, do the shift, and
   3741          narrow back down.  This seems surprisingly long-winded, but
   3742          unfortunately the AMD semantics requires that 8/16/32-bit
   3743          shifts give defined results for shift values all the way up
   3744          to 32, and this seems the simplest way to do it.  It has the
   3745          advantage that the only IR level shifts generated are of 64
   3746          bit values, and the shift amount is guaranteed to be in the
   3747          range 0 .. 63, thereby observing the IR semantics requiring
   3748          all shift values to be in the range 0 .. 2^word_size-1.
   3749 
   3750          Therefore the shift amount is masked with 63 for 64-bit shifts
   3751          and 31 for all others.
   3752       */
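               /* For example (a sketch): "shlb $200,%al" masks the count to
                  200 & 31 == 8; %al is widened to 64 bits, shifted left by
                  8 and narrowed back, giving 0 -- the same as the hardware
                  result of shifting an 8-bit value by 8. */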
   3753       /* shift_amt = shift_expr & MASK, regardless of operation size */
   3754       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );
   3755 
   3756       /* suitably widen the value to be shifted to 64 bits. */
   3757       assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
   3758                                      : widenUto64(mkexpr(dst0)) );
   3759 
   3760       /* res64 = pre64 `shift` shift_amt */
   3761       assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );
   3762 
   3763       /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
   3764       assign( res64ss,
   3765               binop(op64,
   3766                     mkexpr(pre64),
   3767                     binop(Iop_And8,
   3768                           binop(Iop_Sub8,
   3769                                 mkexpr(shift_amt), mkU8(1)),
   3770                           mkU8(mask))) );
   3771 
   3772       /* Build the flags thunk. */
   3773       setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);
   3774 
   3775       /* Narrow the result back down. */
   3776       assign( dst1, narrowTo(ty, mkexpr(res64)) );
   3777 
   3778    } /* if (isShift) */
   3779 
   3780    else
   3781    if (isRotate) {
   3782       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
   3783                                         : (ty==Ity_I32 ? 2 : 3));
   3784       Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
   3785       IRTemp rot_amt   = newTemp(Ity_I8);
   3786       IRTemp rot_amt64 = newTemp(Ity_I8);
   3787       IRTemp oldFlags  = newTemp(Ity_I64);
   3788       UChar  mask      = toUChar(sz==8 ? 63 : 31);
   3789 
   3790       /* rot_amt = shift_expr & mask */
   3791       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   3792          expressions never shift beyond the word size and thus remain
   3793          well defined. */
   3794       assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));
   3795 
   3796       if (ty == Ity_I64)
   3797          assign(rot_amt, mkexpr(rot_amt64));
   3798       else
   3799          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));
   3800 
   3801       if (left) {
   3802 
   3803          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   3804          assign(dst1,
   3805             binop( mkSizedOp(ty,Iop_Or8),
   3806                    binop( mkSizedOp(ty,Iop_Shl8),
   3807                           mkexpr(dst0),
   3808                           mkexpr(rot_amt)
   3809                    ),
   3810                    binop( mkSizedOp(ty,Iop_Shr8),
   3811                           mkexpr(dst0),
   3812                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   3813                    )
   3814             )
   3815          );
   3816          ccOp += AMD64G_CC_OP_ROLB;
   3817 
   3818       } else { /* right */
   3819 
   3820          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   3821          assign(dst1,
   3822             binop( mkSizedOp(ty,Iop_Or8),
   3823                    binop( mkSizedOp(ty,Iop_Shr8),
   3824                           mkexpr(dst0),
   3825                           mkexpr(rot_amt)
   3826                    ),
   3827                    binop( mkSizedOp(ty,Iop_Shl8),
   3828                           mkexpr(dst0),
   3829                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   3830                    )
   3831             )
   3832          );
   3833          ccOp += AMD64G_CC_OP_RORB;
   3834 
   3835       }
   3836 
   3837       /* dst1 now holds the rotated value.  Build flag thunk.  We
   3838          need the resulting value for this, and the previous flags.
   3839          Except don't set it if the rotate count is zero. */
   3840 
   3841       assign(oldFlags, mk_amd64g_calculate_rflags_all());
   3842 
   3843       /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
   3844       IRTemp rot_amt64b = newTemp(Ity_I1);
   3845       assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );
   3846 
   3847       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   3848       stmt( IRStmt_Put( OFFB_CC_OP,
   3849                         IRExpr_ITE( mkexpr(rot_amt64b),
   3850                                     mkU64(ccOp),
   3851                                     IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   3852       stmt( IRStmt_Put( OFFB_CC_DEP1,
   3853                         IRExpr_ITE( mkexpr(rot_amt64b),
   3854                                     widenUto64(mkexpr(dst1)),
   3855                                     IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   3856       stmt( IRStmt_Put( OFFB_CC_DEP2,
   3857                         IRExpr_ITE( mkexpr(rot_amt64b),
   3858                                     mkU64(0),
   3859                                     IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
   3860       stmt( IRStmt_Put( OFFB_CC_NDEP,
   3861                         IRExpr_ITE( mkexpr(rot_amt64b),
   3862                                     mkexpr(oldFlags),
   3863                                     IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   3864    } /* if (isRotate) */
   3865 
   3866    /* Save result, and finish up. */
   3867    if (epartIsReg(modrm)) {
   3868       putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3869       if (vex_traceflags & VEX_TRACE_FE) {
   3870          vex_printf("%s%c ",
   3871                     nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
   3872          if (shift_expr_txt)
   3873             vex_printf("%s", shift_expr_txt);
   3874          else
   3875             ppIRExpr(shift_expr);
   3876          vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
   3877       }
   3878    } else {
   3879       storeLE(mkexpr(addr), mkexpr(dst1));
   3880       if (vex_traceflags & VEX_TRACE_FE) {
   3881          vex_printf("%s%c ",
   3882                     nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
   3883          if (shift_expr_txt)
   3884             vex_printf("%s", shift_expr_txt);
   3885          else
   3886             ppIRExpr(shift_expr);
   3887          vex_printf(", %s\n", dis_buf);
   3888       }
   3889    }
   3890    return delta;
   3891 }
   3892 
   3893 
   3894 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   3895 static
   3896 ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
   3897                      Prefix pfx,
   3898                      Long delta, UChar modrm,
   3899                      Int am_sz, Int sz, ULong src_val,
   3900                      Bool* decode_OK )
   3901 {
   3902    /* src_val denotes a d8.
   3903       And delta on entry points at the modrm byte. */
   3904 
   3905    IRType ty     = szToITy(sz);
   3906    IRTemp t2     = newTemp(Ity_I64);
   3907    IRTemp t2m    = newTemp(Ity_I64);
   3908    IRTemp t_addr = IRTemp_INVALID;
   3909    HChar  dis_buf[50];
   3910    ULong  mask;
   3911 
   3912    /* we're optimists :-) */
   3913    *decode_OK = True;
   3914 
   3915    /* Check whether F2 or F3 are acceptable. */
   3916    if (epartIsReg(modrm)) {
   3917       /* F2 or F3 are not allowed in the register case. */
   3918       if (haveF2orF3(pfx)) {
   3919          *decode_OK = False;
   3920          return delta;
    3921       }
   3922    } else {
   3923       /* F2 or F3 (but not both) are allowable provided LOCK is also
   3924          present. */
   3925       if (haveF2orF3(pfx)) {
   3926          if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
   3927             *decode_OK = False;
   3928             return delta;
   3929          }
   3930       }
   3931    }
   3932 
   3933    /* Limit src_val -- the bit offset -- to something within a word.
   3934       The Intel docs say that literal offsets larger than a word are
   3935       masked in this way. */
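            /* For example (a sketch): "btl $35,%eax" behaves exactly like
               "btl $3,%eax", since 35 & 31 == 3. */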
   3936    switch (sz) {
   3937       case 2:  src_val &= 15; break;
   3938       case 4:  src_val &= 31; break;
   3939       case 8:  src_val &= 63; break;
   3940       default: *decode_OK = False; return delta;
   3941    }
   3942 
   3943    /* Invent a mask suitable for the operation. */
   3944    switch (gregLO3ofRM(modrm)) {
   3945       case 4: /* BT */  mask = 0;                  break;
   3946       case 5: /* BTS */ mask = 1ULL << src_val;    break;
   3947       case 6: /* BTR */ mask = ~(1ULL << src_val); break;
   3948       case 7: /* BTC */ mask = 1ULL << src_val;    break;
   3949          /* If this needs to be extended, probably simplest to make a
   3950             new function to handle the other cases (0 .. 3).  The
    3951             Intel docs do not, however, indicate any use for 0 .. 3, so
   3952             we don't expect this to happen. */
   3953       default: *decode_OK = False; return delta;
   3954    }
   3955 
   3956    /* Fetch the value to be tested and modified into t2, which is
   3957       64-bits wide regardless of sz. */
   3958    if (epartIsReg(modrm)) {
   3959       vassert(am_sz == 1);
   3960       assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
   3961       delta += (am_sz + 1);
   3962       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3963                                 nameISize(sz),
   3964                                 src_val, nameIRegE(sz,pfx,modrm));
   3965    } else {
   3966       Int len;
   3967       t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
   3968       delta  += (len+1);
   3969       assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
   3970       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3971                                 nameISize(sz),
   3972                                 src_val, dis_buf);
   3973    }
   3974 
   3975    /* Compute the new value into t2m, if non-BT. */
   3976    switch (gregLO3ofRM(modrm)) {
   3977       case 4: /* BT */
   3978          break;
   3979       case 5: /* BTS */
   3980          assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
   3981          break;
   3982       case 6: /* BTR */
   3983          assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
   3984          break;
   3985       case 7: /* BTC */
   3986          assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
   3987          break;
    3988       default:
   3989          /*NOTREACHED*/ /*the previous switch guards this*/
   3990          vassert(0);
   3991    }
   3992 
   3993    /* Write the result back, if non-BT. */
   3994    if (gregLO3ofRM(modrm) != 4 /* BT */) {
   3995       if (epartIsReg(modrm)) {
    3996          putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
   3997       } else {
   3998          if (haveLOCK(pfx)) {
   3999             casLE( mkexpr(t_addr),
   4000                    narrowTo(ty, mkexpr(t2))/*expd*/,
   4001                    narrowTo(ty, mkexpr(t2m))/*new*/,
   4002                    guest_RIP_curr_instr );
   4003          } else {
   4004             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   4005          }
   4006       }
   4007    }
   4008 
   4009    /* Copy relevant bit from t2 into the carry flag. */
   4010    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   4011    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   4012    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   4013    stmt( IRStmt_Put(
   4014             OFFB_CC_DEP1,
   4015             binop(Iop_And64,
   4016                   binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
   4017                   mkU64(1))
   4018        ));
   4019    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   4020       elimination of previous stores to this field work better. */
   4021    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   4022 
   4023    return delta;
   4024 }
   4025 
   4026 
   4027 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   4028    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   4029    RDX:RAX/EDX:EAX/DX:AX/AX.
   4030 */
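         /* For instance (illustrative): "mulb %cl" computes AX = AL * CL --
            the whole 16-bit product lands in AX, there being no DX half in
            the 8-bit case -- while "mulq %rcx" computes RDX:RAX = RAX * %rcx. */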
   4031 static void codegen_mulL_A_D ( Int sz, Bool syned,
   4032                                IRTemp tmp, const HChar* tmp_txt )
   4033 {
   4034    IRType ty = szToITy(sz);
   4035    IRTemp t1 = newTemp(ty);
   4036 
   4037    assign( t1, getIRegRAX(sz) );
   4038 
   4039    switch (ty) {
   4040       case Ity_I64: {
   4041          IRTemp res128  = newTemp(Ity_I128);
   4042          IRTemp resHi   = newTemp(Ity_I64);
   4043          IRTemp resLo   = newTemp(Ity_I64);
   4044          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   4045          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4046          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   4047          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4048          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   4049          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   4050          putIReg64(R_RDX, mkexpr(resHi));
   4051          putIReg64(R_RAX, mkexpr(resLo));
   4052          break;
   4053       }
   4054       case Ity_I32: {
   4055          IRTemp res64   = newTemp(Ity_I64);
   4056          IRTemp resHi   = newTemp(Ity_I32);
   4057          IRTemp resLo   = newTemp(Ity_I32);
   4058          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   4059          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4060          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   4061          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4062          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   4063          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   4064          putIRegRDX(4, mkexpr(resHi));
   4065          putIRegRAX(4, mkexpr(resLo));
   4066          break;
   4067       }
   4068       case Ity_I16: {
   4069          IRTemp res32   = newTemp(Ity_I32);
   4070          IRTemp resHi   = newTemp(Ity_I16);
   4071          IRTemp resLo   = newTemp(Ity_I16);
   4072          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   4073          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4074          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   4075          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4076          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   4077          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   4078          putIRegRDX(2, mkexpr(resHi));
   4079          putIRegRAX(2, mkexpr(resLo));
   4080          break;
   4081       }
   4082       case Ity_I8: {
   4083          IRTemp res16   = newTemp(Ity_I16);
   4084          IRTemp resHi   = newTemp(Ity_I8);
   4085          IRTemp resLo   = newTemp(Ity_I8);
   4086          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   4087          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4088          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   4089          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4090          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   4091          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   4092          putIRegRAX(2, mkexpr(res16));
   4093          break;
   4094       }
   4095       default:
   4096          ppIRType(ty);
   4097          vpanic("codegen_mulL_A_D(amd64)");
   4098    }
   4099    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   4100 }
   4101 
   4102 
   4103 /* Group 3 extended opcodes.  We have to decide here whether F2 and F3
    4104    might be valid. */
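         /* For orientation (restating the switch below): /0 is TEST imm,
            /1 is invalid, /2 NOT, /3 NEG, /4 MUL, /5 IMUL, /6 DIV and
            /7 IDIV, keyed off gregLO3ofRM(modrm). */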
   4105 static
   4106 ULong dis_Grp3 ( const VexAbiInfo* vbi,
   4107                  Prefix pfx, Int sz, Long delta, Bool* decode_OK )
   4108 {
   4109    Long    d64;
   4110    UChar   modrm;
   4111    HChar   dis_buf[50];
   4112    Int     len;
   4113    IRTemp  addr;
   4114    IRType  ty = szToITy(sz);
   4115    IRTemp  t1 = newTemp(ty);
   4116    IRTemp dst1, src, dst0;
   4117    *decode_OK = True;
   4118    modrm = getUChar(delta);
   4119    if (epartIsReg(modrm)) {
   4120       /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
   4121       if (haveF2orF3(pfx)) goto unhandled;
   4122       switch (gregLO3ofRM(modrm)) {
   4123          case 0: { /* TEST */
   4124             delta++;
   4125             d64 = getSDisp(imin(4,sz), delta);
   4126             delta += imin(4,sz);
   4127             dst1 = newTemp(ty);
   4128             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   4129                                getIRegE(sz,pfx,modrm),
   4130                                mkU(ty, d64 & mkSizeMask(sz))));
   4131             setFlags_DEP1( Iop_And8, dst1, ty );
   4132             DIP("test%c $%lld, %s\n",
   4133                 nameISize(sz), d64,
   4134                 nameIRegE(sz, pfx, modrm));
   4135             break;
   4136          }
   4137          case 1:
   4138             *decode_OK = False;
   4139             return delta;
   4140          case 2: /* NOT */
   4141             delta++;
   4142             putIRegE(sz, pfx, modrm,
   4143                               unop(mkSizedOp(ty,Iop_Not8),
   4144                                    getIRegE(sz, pfx, modrm)));
   4145             DIP("not%c %s\n", nameISize(sz),
   4146                               nameIRegE(sz, pfx, modrm));
   4147             break;
   4148          case 3: /* NEG */
   4149             delta++;
   4150             dst0 = newTemp(ty);
   4151             src  = newTemp(ty);
   4152             dst1 = newTemp(ty);
   4153             assign(dst0, mkU(ty,0));
   4154             assign(src,  getIRegE(sz, pfx, modrm));
   4155             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   4156                                                        mkexpr(src)));
   4157             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   4158             putIRegE(sz, pfx, modrm, mkexpr(dst1));
   4159             DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
   4160             break;
   4161          case 4: /* MUL (unsigned widening) */
   4162             delta++;
   4163             src = newTemp(ty);
   4164             assign(src, getIRegE(sz,pfx,modrm));
   4165             codegen_mulL_A_D ( sz, False, src,
   4166                                nameIRegE(sz,pfx,modrm) );
   4167             break;
   4168          case 5: /* IMUL (signed widening) */
   4169             delta++;
   4170             src = newTemp(ty);
   4171             assign(src, getIRegE(sz,pfx,modrm));
   4172             codegen_mulL_A_D ( sz, True, src,
   4173                                nameIRegE(sz,pfx,modrm) );
   4174             break;
   4175          case 6: /* DIV */
   4176             delta++;
   4177             assign( t1, getIRegE(sz, pfx, modrm) );
   4178             codegen_div ( sz, t1, False );
   4179             DIP("div%c %s\n", nameISize(sz),
   4180                               nameIRegE(sz, pfx, modrm));
   4181             break;
   4182          case 7: /* IDIV */
   4183             delta++;
   4184             assign( t1, getIRegE(sz, pfx, modrm) );
   4185             codegen_div ( sz, t1, True );
   4186             DIP("idiv%c %s\n", nameISize(sz),
   4187                                nameIRegE(sz, pfx, modrm));
   4188             break;
   4189          default:
   4190             /*NOTREACHED*/
   4191             vpanic("Grp3(amd64,R)");
   4192       }
   4193    } else {
   4194       /* Decide if F2/XACQ or F3/XREL might be valid. */
   4195       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4196       if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
   4197           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4198          validF2orF3 = True;
   4199       }
   4200       if (!validF2orF3) goto unhandled;
   4201       /* */
   4202       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   4203                         /* we have to inform disAMode of any immediate
   4204                            bytes used */
   4205                         gregLO3ofRM(modrm)==0/*TEST*/
   4206                            ? imin(4,sz)
   4207                            : 0
   4208                       );
   4209       t1   = newTemp(ty);
   4210       delta += len;
   4211       assign(t1, loadLE(ty,mkexpr(addr)));
   4212       switch (gregLO3ofRM(modrm)) {
   4213          case 0: { /* TEST */
   4214             d64 = getSDisp(imin(4,sz), delta);
   4215             delta += imin(4,sz);
   4216             dst1 = newTemp(ty);
   4217             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   4218                                mkexpr(t1),
   4219                                mkU(ty, d64 & mkSizeMask(sz))));
   4220             setFlags_DEP1( Iop_And8, dst1, ty );
   4221             DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
   4222             break;
   4223          }
   4224          case 1:
   4225             *decode_OK = False;
   4226             return delta;
   4227          case 2: /* NOT */
   4228             dst1 = newTemp(ty);
   4229             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   4230             if (haveLOCK(pfx)) {
   4231                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   4232                                     guest_RIP_curr_instr );
   4233             } else {
   4234                storeLE( mkexpr(addr), mkexpr(dst1) );
   4235             }
   4236             DIP("not%c %s\n", nameISize(sz), dis_buf);
   4237             break;
   4238          case 3: /* NEG */
   4239             dst0 = newTemp(ty);
   4240             src  = newTemp(ty);
   4241             dst1 = newTemp(ty);
   4242             assign(dst0, mkU(ty,0));
   4243             assign(src,  mkexpr(t1));
   4244             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   4245                                                        mkexpr(src)));
   4246             if (haveLOCK(pfx)) {
   4247                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   4248                                     guest_RIP_curr_instr );
   4249             } else {
   4250                storeLE( mkexpr(addr), mkexpr(dst1) );
   4251             }
   4252             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   4253             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   4254             break;
   4255          case 4: /* MUL (unsigned widening) */
   4256             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   4257             break;
   4258          case 5: /* IMUL */
   4259             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   4260             break;
   4261          case 6: /* DIV */
   4262             codegen_div ( sz, t1, False );
   4263             DIP("div%c %s\n", nameISize(sz), dis_buf);
   4264             break;
   4265          case 7: /* IDIV */
   4266             codegen_div ( sz, t1, True );
   4267             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   4268             break;
   4269          default:
   4270             /*NOTREACHED*/
   4271             vpanic("Grp3(amd64,M)");
   4272       }
   4273    }
   4274    return delta;
   4275   unhandled:
   4276    *decode_OK = False;
   4277    return delta;
   4278 }
   4279 
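         /* Editor's note: in all of these "Group" decoders the
            sub-opcode lives in bits 5..3 of the ModRM byte, which is
            what gregLO3ofRM extracts; in sketch form:

               sub_opc = (modrm >> 3) & 7;
               Grp3: 0=TEST, 1=invalid, 2=NOT, 3=NEG,
                     4=MUL, 5=IMUL, 6=DIV, 7=IDIV
         */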
   4280 
   4281 /* Group 4 extended opcodes.  We have to decide here whether F2 and F3
   4282    might be valid. */
   4283 static
   4284 ULong dis_Grp4 ( const VexAbiInfo* vbi,
   4285                  Prefix pfx, Long delta, Bool* decode_OK )
   4286 {
   4287    Int   alen;
   4288    UChar modrm;
   4289    HChar dis_buf[50];
   4290    IRType ty = Ity_I8;
   4291    IRTemp t1 = newTemp(ty);
   4292    IRTemp t2 = newTemp(ty);
   4293 
   4294    *decode_OK = True;
   4295 
   4296    modrm = getUChar(delta);
   4297    if (epartIsReg(modrm)) {
   4298       /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
   4299       if (haveF2orF3(pfx)) goto unhandled;
   4300       assign(t1, getIRegE(1, pfx, modrm));
   4301       switch (gregLO3ofRM(modrm)) {
   4302          case 0: /* INC */
   4303             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   4304             putIRegE(1, pfx, modrm, mkexpr(t2));
   4305             setFlags_INC_DEC( True, t2, ty );
   4306             break;
   4307          case 1: /* DEC */
   4308             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   4309             putIRegE(1, pfx, modrm, mkexpr(t2));
   4310             setFlags_INC_DEC( False, t2, ty );
   4311             break;
   4312          default:
   4313             *decode_OK = False;
   4314             return delta;
   4315       }
   4316       delta++;
   4317       DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
   4318                       nameIRegE(1, pfx, modrm));
   4319    } else {
   4320       /* Decide if F2/XACQ or F3/XREL might be valid. */
   4321       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4322       if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
   4323           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4324          validF2orF3 = True;
   4325       }
   4326       if (!validF2orF3) goto unhandled;
   4327       /* */
   4328       IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   4329       assign( t1, loadLE(ty, mkexpr(addr)) );
   4330       switch (gregLO3ofRM(modrm)) {
   4331          case 0: /* INC */
   4332             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   4333             if (haveLOCK(pfx)) {
   4334                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   4335                       guest_RIP_curr_instr );
   4336             } else {
   4337                storeLE( mkexpr(addr), mkexpr(t2) );
   4338             }
   4339             setFlags_INC_DEC( True, t2, ty );
   4340             break;
   4341          case 1: /* DEC */
   4342             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   4343             if (haveLOCK(pfx)) {
   4344                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   4345                       guest_RIP_curr_instr );
   4346             } else {
   4347                storeLE( mkexpr(addr), mkexpr(t2) );
   4348             }
   4349             setFlags_INC_DEC( False, t2, ty );
   4350             break;
   4351          default:
   4352             *decode_OK = False;
   4353             return delta;
   4354       }
   4355       delta += alen;
   4356       DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   4357    }
   4358    return delta;
   4359   unhandled:
   4360    *decode_OK = False;
   4361    return delta;
   4362 }
   4363 
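         /* Editor's sketch of the LOCKed path used above (and in
            dis_Grp3/dis_Grp5).  casLE turns the read-modify-write into
            a compare-and-swap whose expected value is the originally
            loaded one; as I understand its use here, a failed CAS
            causes the guest insn to be restarted:

               old = load(addr);
               new = old + 1;              // or whatever the op is
               if (!CAS(addr, old, new))   // memory changed under us?
                  restart at guest_RIP_curr_instr;
         */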
   4364 
   4365 /* Group 5 extended opcodes.  We have to decide here whether F2 and F3
   4366    might be valid. */
   4367 static
   4368 ULong dis_Grp5 ( const VexAbiInfo* vbi,
   4369                  Prefix pfx, Int sz, Long delta,
   4370                  /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   4371 {
   4372    Int     len;
   4373    UChar   modrm;
   4374    HChar   dis_buf[50];
   4375    IRTemp  addr = IRTemp_INVALID;
   4376    IRType  ty = szToITy(sz);
   4377    IRTemp  t1 = newTemp(ty);
   4378    IRTemp  t2 = IRTemp_INVALID;
   4379    IRTemp  t3 = IRTemp_INVALID;
   4380    Bool    showSz = True;
   4381 
   4382    *decode_OK = True;
   4383 
   4384    modrm = getUChar(delta);
   4385    if (epartIsReg(modrm)) {
   4386       /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
   4387          F2/CALL and F2/JMP may have bnd prefix. */
    4388       if (haveF2orF3(pfx)
    4389           && ! (haveF2(pfx)
    4390                 && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
    4391          goto unhandledR;
   4392       assign(t1, getIRegE(sz,pfx,modrm));
   4393       switch (gregLO3ofRM(modrm)) {
   4394          case 0: /* INC */
   4395             t2 = newTemp(ty);
   4396             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4397                              mkexpr(t1), mkU(ty,1)));
   4398             setFlags_INC_DEC( True, t2, ty );
   4399             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4400             break;
   4401          case 1: /* DEC */
   4402             t2 = newTemp(ty);
   4403             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4404                              mkexpr(t1), mkU(ty,1)));
   4405             setFlags_INC_DEC( False, t2, ty );
   4406             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4407             break;
   4408          case 2: /* call Ev */
   4409             /* Ignore any sz value and operate as if sz==8. */
   4410             if (!(sz == 4 || sz == 8)) goto unhandledR;
   4411             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4412             sz = 8;
   4413             t3 = newTemp(Ity_I64);
   4414             assign(t3, getIRegE(sz,pfx,modrm));
   4415             t2 = newTemp(Ity_I64);
   4416             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4417             putIReg64(R_RSP, mkexpr(t2));
   4418             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
   4419             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
   4420             jmp_treg(dres, Ijk_Call, t3);
   4421             vassert(dres->whatNext == Dis_StopHere);
   4422             showSz = False;
   4423             break;
   4424          case 4: /* jmp Ev */
   4425             /* Ignore any sz value and operate as if sz==8. */
   4426             if (!(sz == 4 || sz == 8)) goto unhandledR;
   4427             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4428             sz = 8;
   4429             t3 = newTemp(Ity_I64);
   4430             assign(t3, getIRegE(sz,pfx,modrm));
   4431             jmp_treg(dres, Ijk_Boring, t3);
   4432             vassert(dres->whatNext == Dis_StopHere);
   4433             showSz = False;
   4434             break;
   4435          case 6: /* PUSH Ev */
   4436             /* There is no encoding for 32-bit operand size; hence ... */
   4437             if (sz == 4) sz = 8;
   4438             if (sz == 8 || sz == 2) {
   4439                ty = szToITy(sz); /* redo it, since sz might have changed */
   4440                t3 = newTemp(ty);
   4441                assign(t3, getIRegE(sz,pfx,modrm));
   4442                t2 = newTemp(Ity_I64);
   4443                assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   4444                putIReg64(R_RSP, mkexpr(t2) );
   4445                storeLE( mkexpr(t2), mkexpr(t3) );
   4446                break;
   4447             } else {
   4448                goto unhandledR; /* awaiting test case */
   4449             }
   4450          default:
   4451          unhandledR:
   4452             *decode_OK = False;
   4453             return delta;
   4454       }
   4455       delta++;
   4456       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4457                        showSz ? nameISize(sz) : ' ',
   4458                        nameIRegE(sz, pfx, modrm));
   4459    } else {
   4460       /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
   4461       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4462       if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
   4463           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4464          validF2orF3 = True;
   4465       } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
   4466                  && (haveF2(pfx) && !haveF3(pfx))) {
   4467          validF2orF3 = True;
   4468       }
   4469       if (!validF2orF3) goto unhandledM;
   4470       /* */
   4471       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   4472       if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
   4473                                   && gregLO3ofRM(modrm) != 6) {
   4474          assign(t1, loadLE(ty,mkexpr(addr)));
   4475       }
   4476       switch (gregLO3ofRM(modrm)) {
   4477          case 0: /* INC */
   4478             t2 = newTemp(ty);
   4479             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4480                              mkexpr(t1), mkU(ty,1)));
   4481             if (haveLOCK(pfx)) {
   4482                casLE( mkexpr(addr),
   4483                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4484             } else {
   4485                storeLE(mkexpr(addr),mkexpr(t2));
   4486             }
   4487             setFlags_INC_DEC( True, t2, ty );
   4488             break;
   4489          case 1: /* DEC */
   4490             t2 = newTemp(ty);
   4491             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4492                              mkexpr(t1), mkU(ty,1)));
   4493             if (haveLOCK(pfx)) {
   4494                casLE( mkexpr(addr),
   4495                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4496             } else {
   4497                storeLE(mkexpr(addr),mkexpr(t2));
   4498             }
   4499             setFlags_INC_DEC( False, t2, ty );
   4500             break;
   4501          case 2: /* call Ev */
   4502             /* Ignore any sz value and operate as if sz==8. */
   4503             if (!(sz == 4 || sz == 8)) goto unhandledM;
   4504             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4505             sz = 8;
   4506             t3 = newTemp(Ity_I64);
   4507             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4508             t2 = newTemp(Ity_I64);
   4509             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4510             putIReg64(R_RSP, mkexpr(t2));
   4511             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
   4512             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
   4513             jmp_treg(dres, Ijk_Call, t3);
   4514             vassert(dres->whatNext == Dis_StopHere);
   4515             showSz = False;
   4516             break;
    4517          case 4: /* jmp Ev */
   4518             /* Ignore any sz value and operate as if sz==8. */
   4519             if (!(sz == 4 || sz == 8)) goto unhandledM;
   4520             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4521             sz = 8;
   4522             t3 = newTemp(Ity_I64);
   4523             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4524             jmp_treg(dres, Ijk_Boring, t3);
   4525             vassert(dres->whatNext == Dis_StopHere);
   4526             showSz = False;
   4527             break;
   4528          case 6: /* PUSH Ev */
   4529             /* There is no encoding for 32-bit operand size; hence ... */
   4530             if (sz == 4) sz = 8;
   4531             if (sz == 8 || sz == 2) {
   4532                ty = szToITy(sz); /* redo it, since sz might have changed */
   4533                t3 = newTemp(ty);
   4534                assign(t3, loadLE(ty,mkexpr(addr)));
   4535                t2 = newTemp(Ity_I64);
   4536                assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   4537                putIReg64(R_RSP, mkexpr(t2) );
   4538                storeLE( mkexpr(t2), mkexpr(t3) );
   4539                break;
   4540             } else {
   4541                goto unhandledM; /* awaiting test case */
   4542             }
   4543          default:
   4544          unhandledM:
   4545             *decode_OK = False;
   4546             return delta;
   4547       }
   4548       delta += len;
   4549       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4550                        showSz ? nameISize(sz) : ' ',
   4551                        dis_buf);
   4552    }
   4553    return delta;
   4554 }
   4555 
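         /* Editor's note: in outline, the 'call Ev' cases above do

               t3  = Ev                       // fetch target first
               RSP = RSP - 8
               mem64[RSP] = RIP of next insn  // push return address
               RIP = t3

            and 'jmp Ev' is the same minus the push.  Since the target
            is fetched before RSP moves, forms like 'call *(%rsp)' see
            the pre-call stack, as required. */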
   4556 
   4557 /*------------------------------------------------------------*/
   4558 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4559 /*------------------------------------------------------------*/
   4560 
   4561 /* Code shared by all the string ops */
   4562 static
   4563 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4564 {
   4565    UChar logSz;
   4566    if (sz == 8 || sz == 4 || sz == 2) {
   4567       logSz = 1;
   4568       if (sz == 4) logSz = 2;
   4569       if (sz == 8) logSz = 3;
   4570       assign( t_inc,
   4571               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4572                                mkU8(logSz) ) );
   4573    } else {
   4574       assign( t_inc,
   4575               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4576    }
   4577 }
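
         /* Editor's note: guest DFLAG holds +1 or -1 rather than the
            raw EFLAGS bit, so the shift above yields the signed
            per-element step directly, e.g.

               DF == +1 (CLD), sz == 4  =>  t_inc = (+1 << 2) = +4
               DF == -1 (STD), sz == 8  =>  t_inc = (-1 << 3) = -8
         */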
   4578 
   4579 static
   4580 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4581                     Int sz, const HChar* name, Prefix pfx )
   4582 {
   4583    IRTemp t_inc = newTemp(Ity_I64);
   4584    /* Really we ought to inspect the override prefixes, but we don't.
    4585       The following assertion catches any resulting silliness. */
   4586    vassert(pfx == clearSegBits(pfx));
   4587    dis_string_op_increment(sz, t_inc);
   4588    dis_OP( sz, t_inc, pfx );
   4589    DIP("%s%c\n", name, nameISize(sz));
   4590 }
   4591 
   4592 static
   4593 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4594 {
   4595    IRType ty = szToITy(sz);
   4596    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4597    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4598    IRExpr *incd, *incs;
   4599 
   4600    if (haveASO(pfx)) {
   4601       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4602       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4603    } else {
   4604       assign( td, getIReg64(R_RDI) );
   4605       assign( ts, getIReg64(R_RSI) );
   4606    }
   4607 
   4608    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4609 
   4610    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4611    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4612    if (haveASO(pfx)) {
   4613       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4614       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4615    }
   4616    putIReg64( R_RDI, incd );
   4617    putIReg64( R_RSI, incs );
   4618 }
   4619 
   4620 static
   4621 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4622 {
   4623    IRType ty = szToITy(sz);
   4624    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4625    IRExpr *incs;
   4626 
   4627    if (haveASO(pfx))
   4628       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4629    else
   4630       assign( ts, getIReg64(R_RSI) );
   4631 
   4632    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4633 
   4634    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4635    if (haveASO(pfx))
   4636       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4637    putIReg64( R_RSI, incs );
   4638 }
   4639 
   4640 static
   4641 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4642 {
   4643    IRType ty = szToITy(sz);
   4644    IRTemp ta = newTemp(ty);        /* rAX */
   4645    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4646    IRExpr *incd;
   4647 
   4648    assign( ta, getIRegRAX(sz) );
   4649 
   4650    if (haveASO(pfx))
   4651       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4652    else
   4653       assign( td, getIReg64(R_RDI) );
   4654 
   4655    storeLE( mkexpr(td), mkexpr(ta) );
   4656 
   4657    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4658    if (haveASO(pfx))
   4659       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4660    putIReg64( R_RDI, incd );
   4661 }
   4662 
   4663 static
   4664 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4665 {
   4666    IRType ty  = szToITy(sz);
   4667    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4668    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4669    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4670    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4671    IRExpr *incd, *incs;
   4672 
   4673    if (haveASO(pfx)) {
   4674       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4675       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4676    } else {
   4677       assign( td, getIReg64(R_RDI) );
   4678       assign( ts, getIReg64(R_RSI) );
   4679    }
   4680 
   4681    assign( tdv, loadLE(ty,mkexpr(td)) );
   4682 
   4683    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4684 
   4685    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4686 
   4687    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4688    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4689    if (haveASO(pfx)) {
   4690       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4691       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4692    }
   4693    putIReg64( R_RDI, incd );
   4694    putIReg64( R_RSI, incs );
   4695 }
   4696 
   4697 static
   4698 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4699 {
   4700    IRType ty  = szToITy(sz);
   4701    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4702    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4703    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4704    IRExpr *incd;
   4705 
   4706    assign( ta, getIRegRAX(sz) );
   4707 
   4708    if (haveASO(pfx))
   4709       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4710    else
   4711       assign( td, getIReg64(R_RDI) );
   4712 
   4713    assign( tdv, loadLE(ty,mkexpr(td)) );
   4714 
   4715    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4716 
   4717    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4718    if (haveASO(pfx))
   4719       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4720    putIReg64( R_RDI, incd );
   4721 }
   4722 
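         /* Editor's worked example: 'repne scasb' with AL == 0 steps
            RDI through memory, decrementing RCX, until a zero byte is
            read -- the classic inline strlen idiom.  The REP machinery
            that drives it follows. */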
   4723 
   4724 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4725    the insn is the last one in the basic block, and so emit a jump to
   4726    the next insn, rather than just falling through. */
   4727 static
   4728 void dis_REP_op ( /*MOD*/DisResult* dres,
   4729                   AMD64Condcode cond,
   4730                   void (*dis_OP)(Int, IRTemp, Prefix),
   4731                   Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
   4732                   Prefix pfx )
   4733 {
   4734    IRTemp t_inc = newTemp(Ity_I64);
   4735    IRTemp tc;
   4736    IRExpr* cmp;
   4737 
   4738    /* Really we ought to inspect the override prefixes, but we don't.
    4739       The following assertion catches any resulting silliness. */
   4740    vassert(pfx == clearSegBits(pfx));
   4741 
   4742    if (haveASO(pfx)) {
   4743       tc = newTemp(Ity_I32);  /*  ECX  */
   4744       assign( tc, getIReg32(R_RCX) );
   4745       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4746    } else {
   4747       tc = newTemp(Ity_I64);  /*  RCX  */
   4748       assign( tc, getIReg64(R_RCX) );
   4749       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4750    }
   4751 
   4752    stmt( IRStmt_Exit( cmp, Ijk_Boring,
   4753                       IRConst_U64(rip_next), OFFB_RIP ) );
   4754 
   4755    if (haveASO(pfx))
   4756       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
    4757    else
   4758       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4759 
   4760    dis_string_op_increment(sz, t_inc);
   4761    dis_OP (sz, t_inc, pfx);
   4762 
   4763    if (cond == AMD64CondAlways) {
   4764       jmp_lit(dres, Ijk_Boring, rip);
   4765       vassert(dres->whatNext == Dis_StopHere);
   4766    } else {
   4767       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4768                          Ijk_Boring,
   4769                          IRConst_U64(rip),
   4770                          OFFB_RIP ) );
   4771       jmp_lit(dres, Ijk_Boring, rip_next);
   4772       vassert(dres->whatNext == Dis_StopHere);
   4773    }
   4774    DIP("%s%c\n", name, nameISize(sz));
   4775 }
   4776 
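         /* Editor's sketch of the resulting control flow, one guest
            iteration per translation:

               if (xCX == 0) goto rip_next;   // exit before any work
               xCX = xCX - 1;
               <one step of the string op>;
               if (cond) goto rip;            // REPE/REPNE: loop again
               goto rip_next;

            For plain REP, cond is AMD64CondAlways and the jump back to
            rip is unconditional. */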
   4777 
   4778 /*------------------------------------------------------------*/
   4779 /*--- Arithmetic, etc.                                     ---*/
   4780 /*------------------------------------------------------------*/
   4781 
    4782 /* IMUL E, G.  Supplied rip points to the modR/M byte. */
   4783 static
   4784 ULong dis_mul_E_G ( const VexAbiInfo* vbi,
   4785                     Prefix      pfx,
   4786                     Int         size,
   4787                     Long        delta0 )
   4788 {
   4789    Int    alen;
   4790    HChar  dis_buf[50];
   4791    UChar  rm = getUChar(delta0);
   4792    IRType ty = szToITy(size);
   4793    IRTemp te = newTemp(ty);
   4794    IRTemp tg = newTemp(ty);
   4795    IRTemp resLo = newTemp(ty);
   4796 
   4797    assign( tg, getIRegG(size, pfx, rm) );
   4798    if (epartIsReg(rm)) {
   4799       assign( te, getIRegE(size, pfx, rm) );
   4800    } else {
   4801       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4802       assign( te, loadLE(ty,mkexpr(addr)) );
   4803    }
   4804 
   4805    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4806 
   4807    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4808 
   4809    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4810 
   4811    if (epartIsReg(rm)) {
   4812       DIP("imul%c %s, %s\n", nameISize(size),
   4813                              nameIRegE(size,pfx,rm),
   4814                              nameIRegG(size,pfx,rm));
   4815       return 1+delta0;
   4816    } else {
   4817       DIP("imul%c %s, %s\n", nameISize(size),
   4818                              dis_buf,
   4819                              nameIRegG(size,pfx,rm));
   4820       return alen+delta0;
   4821    }
   4822 }
   4823 
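         /* Editor's note: this two-operand form keeps only the low
            half of the product; CF and OF (via setFlags_MUL) record
            whether the discarded high half was significant.
            Illustrative model, under #if 0: */
         #if 0
         static Bool ref_imul32_overflows ( Int a, Int b, /*OUT*/Int* lo )
         {
            Long full = (Long)a * (Long)b;  /* exact 64-bit product */
            *lo = (Int)full;                /* only the low half survives */
            return full != (Long)(*lo);     /* True <=> CF = OF = 1 */
         }
         #endif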
   4824 
   4825 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
   4826 static
   4827 ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
   4828                        Prefix      pfx,
   4829                        Int         size,
   4830                        Long        delta,
   4831                        Int         litsize )
   4832 {
   4833    Long   d64;
   4834    Int    alen;
   4835    HChar  dis_buf[50];
   4836    UChar  rm = getUChar(delta);
   4837    IRType ty = szToITy(size);
   4838    IRTemp te = newTemp(ty);
   4839    IRTemp tl = newTemp(ty);
   4840    IRTemp resLo = newTemp(ty);
   4841 
   4842    vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
   4843 
   4844    if (epartIsReg(rm)) {
   4845       assign(te, getIRegE(size, pfx, rm));
   4846       delta++;
   4847    } else {
   4848       IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   4849                                      imin(4,litsize) );
   4850       assign(te, loadLE(ty, mkexpr(addr)));
   4851       delta += alen;
   4852    }
   4853    d64 = getSDisp(imin(4,litsize),delta);
   4854    delta += imin(4,litsize);
   4855 
   4856    d64 &= mkSizeMask(size);
   4857    assign(tl, mkU(ty,d64));
   4858 
   4859    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   4860 
   4861    setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
   4862 
   4863    putIRegG(size, pfx, rm, mkexpr(resLo));
   4864 
   4865    DIP("imul%c $%lld, %s, %s\n",
   4866        nameISize(size), d64,
   4867        ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
   4868        nameIRegG(size,pfx,rm) );
   4869    return delta;
   4870 }
   4871 
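         /* Editor's note: 'litsize' distinguishes the sign-extended
            imm8 form of IMUL from the imm16/32 form; either way at
            most 4 immediate bytes are read (imin(4,litsize)),
            sign-extended by getSDisp, and then masked to the operand
            size. */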
   4872 
   4873 /* Generate an IR sequence to do a popcount operation on the supplied
   4874    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4875    Ity_I16, Ity_I32 or Ity_I64 only. */
   4876 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4877 {
   4878    Int i;
   4879    if (ty == Ity_I16) {
   4880       IRTemp old = IRTemp_INVALID;
   4881       IRTemp nyu = IRTemp_INVALID;
   4882       IRTemp mask[4], shift[4];
   4883       for (i = 0; i < 4; i++) {
   4884          mask[i]  = newTemp(ty);
   4885          shift[i] = 1 << i;
   4886       }
   4887       assign(mask[0], mkU16(0x5555));
   4888       assign(mask[1], mkU16(0x3333));
   4889       assign(mask[2], mkU16(0x0F0F));
   4890       assign(mask[3], mkU16(0x00FF));
   4891       old = src;
   4892       for (i = 0; i < 4; i++) {
   4893          nyu = newTemp(ty);
   4894          assign(nyu,
   4895                 binop(Iop_Add16,
   4896                       binop(Iop_And16,
   4897                             mkexpr(old),
   4898                             mkexpr(mask[i])),
   4899                       binop(Iop_And16,
   4900                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4901                             mkexpr(mask[i]))));
   4902          old = nyu;
   4903       }
   4904       return nyu;
   4905    }
   4906    if (ty == Ity_I32) {
   4907       IRTemp old = IRTemp_INVALID;
   4908       IRTemp nyu = IRTemp_INVALID;
   4909       IRTemp mask[5], shift[5];
   4910       for (i = 0; i < 5; i++) {
   4911          mask[i]  = newTemp(ty);
   4912          shift[i] = 1 << i;
   4913       }
   4914       assign(mask[0], mkU32(0x55555555));
   4915       assign(mask[1], mkU32(0x33333333));
   4916       assign(mask[2], mkU32(0x0F0F0F0F));
   4917       assign(mask[3], mkU32(0x00FF00FF));
   4918       assign(mask[4], mkU32(0x0000FFFF));
   4919       old = src;
   4920       for (i = 0; i < 5; i++) {
   4921          nyu = newTemp(ty);
   4922          assign(nyu,
   4923                 binop(Iop_Add32,
   4924                       binop(Iop_And32,
   4925                             mkexpr(old),
   4926                             mkexpr(mask[i])),
   4927                       binop(Iop_And32,
   4928                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4929                             mkexpr(mask[i]))));
   4930          old = nyu;
   4931       }
   4932       return nyu;
   4933    }
   4934    if (ty == Ity_I64) {
   4935       IRTemp old = IRTemp_INVALID;
   4936       IRTemp nyu = IRTemp_INVALID;
   4937       IRTemp mask[6], shift[6];
   4938       for (i = 0; i < 6; i++) {
   4939          mask[i]  = newTemp(ty);
   4940          shift[i] = 1 << i;
   4941       }
   4942       assign(mask[0], mkU64(0x5555555555555555ULL));
   4943       assign(mask[1], mkU64(0x3333333333333333ULL));
   4944       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4945       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4946       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4947       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4948       old = src;
   4949       for (i = 0; i < 6; i++) {
   4950          nyu = newTemp(ty);
   4951          assign(nyu,
   4952                 binop(Iop_Add64,
   4953                       binop(Iop_And64,
   4954                             mkexpr(old),
   4955                             mkexpr(mask[i])),
   4956                       binop(Iop_And64,
   4957                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4958                             mkexpr(mask[i]))));
   4959          old = nyu;
   4960       }
   4961       return nyu;
   4962    }
   4963    /*NOTREACHED*/
   4964    vassert(0);
   4965 }
   4966 
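         /* Editor's reference model of the ladder above, for the
            16-bit case, kept under #if 0: each step adds bit-field
            pairs to form twice-as-wide partial counts.  (Note that in
            the IR version the shift[] array holds plain shift amounts,
            not live IRTemps.) */
         #if 0
         static UInt ref_popcount16 ( UShort x )
         {
            UInt w = x;
            w = (w & 0x5555) + ((w >> 1) & 0x5555); /* 1-bit -> 2-bit sums */
            w = (w & 0x3333) + ((w >> 2) & 0x3333); /* 2-bit -> 4-bit sums */
            w = (w & 0x0F0F) + ((w >> 4) & 0x0F0F); /* 4-bit -> 8-bit sums */
            w = (w & 0x00FF) + ((w >> 8) & 0x00FF); /* final count, 0..16 */
            return w;
         }
         #endif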
   4967 
   4968 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4969    the supplied IRTemp, and return a new IRTemp holding the result.
   4970    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4971    the argument is zero, return the number of bits in the word (the
   4972    natural semantics). */
   4973 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4974 {
   4975    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4976 
   4977    IRTemp src64 = newTemp(Ity_I64);
   4978    assign(src64, widenUto64( mkexpr(src) ));
   4979 
   4980    IRTemp src64x = newTemp(Ity_I64);
   4981    assign(src64x,
   4982           binop(Iop_Shl64, mkexpr(src64),
   4983                            mkU8(64 - 8 * sizeofIRType(ty))));
   4984 
   4985    // Clz64 has undefined semantics when its input is zero, so
   4986    // special-case around that.
   4987    IRTemp res64 = newTemp(Ity_I64);
   4988    assign(res64,
   4989           IRExpr_ITE(
   4990              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4991              mkU64(8 * sizeofIRType(ty)),
   4992              unop(Iop_Clz64, mkexpr(src64x))
   4993    ));
   4994 
   4995    IRTemp res = newTemp(ty);
   4996    assign(res, narrowTo(ty, mkexpr(res64)));
   4997    return res;
   4998 }
   4999 
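         /* Editor's worked example: ty == Ity_I16, src == 0x0001.
            Then src64x == 1 << 48, Clz64(src64x) == 15, which is
            exactly lzcntw's answer; src == 0 takes the ITE branch and
            yields 16. */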
   5000 
   5001 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   5002    the supplied IRTemp, and return a new IRTemp holding the result.
   5003    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   5004    the argument is zero, return the number of bits in the word (the
   5005    natural semantics). */
   5006 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   5007 {
   5008    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   5009 
   5010    IRTemp src64 = newTemp(Ity_I64);
   5011    assign(src64, widenUto64( mkexpr(src) ));
   5012 
   5013    // Ctz64 has undefined semantics when its input is zero, so
   5014    // special-case around that.
   5015    IRTemp res64 = newTemp(Ity_I64);
   5016    assign(res64,
   5017           IRExpr_ITE(
   5018              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   5019              mkU64(8 * sizeofIRType(ty)),
   5020              unop(Iop_Ctz64, mkexpr(src64))
   5021    ));
   5022 
   5023    IRTemp res = newTemp(ty);
   5024    assign(res, narrowTo(ty, mkexpr(res64)));
   5025    return res;
   5026 }
   5027 
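         /* Editor's worked example: ty == Ity_I32, src == 0x00000008
            gives Ctz64 == 3; src == 0 takes the ITE branch and yields
            32.  No shifting trick is needed here, since zero-extending
            a nonzero value leaves its trailing-zero count unchanged. */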
   5028 
   5029 /*------------------------------------------------------------*/
   5030 /*---                                                      ---*/
   5031 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   5032 /*---                                                      ---*/
   5033 /*------------------------------------------------------------*/
   5034 
   5035 /* --- Helper functions for dealing with the register stack. --- */
   5036 
   5037 /* --- Set the emulation-warning pseudo-register. --- */
   5038 
   5039 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   5040 {
   5041    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   5042    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   5043 }
   5044 
   5045 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   5046 
   5047 static IRExpr* mkQNaN64 ( void )
   5048 {
    5049    /* QNaN is sign 0, exponent 2047, QNaN bit 1, then 51 zero bits
    5050       == 0b 0 11111111111 1 0(51 times)
    5051       == 0x7FF8 0000 0000 0000
    5052     */
   5053    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   5054 }
   5055 
   5056 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   5057 
   5058 static IRExpr* get_ftop ( void )
   5059 {
   5060    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   5061 }
   5062 
   5063 static void put_ftop ( IRExpr* e )
   5064 {
   5065    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   5066    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   5067 }
   5068 
   5069 /* --------- Get/put the C3210 bits. --------- */
   5070 
   5071 static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
   5072 {
   5073    return IRExpr_Get( OFFB_FC3210, Ity_I64 );
   5074 }
   5075 
   5076 static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
   5077 {
   5078    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   5079    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   5080 }
   5081 
   5082 /* --------- Get/put the FPU rounding mode. --------- */
   5083 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   5084 {
   5085    return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
   5086 }
   5087 
   5088 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   5089 {
   5090    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   5091    stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
   5092 }
   5093 
   5094 
   5095 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   5096 /* Produces a value in 0 .. 3, which is encoded as per the type
   5097    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   5098    per IRRoundingMode, we merely need to get it and mask it for
   5099    safety.
   5100 */
   5101 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   5102 {
   5103    return binop( Iop_And32, get_fpround(), mkU32(3) );
   5104 }
   5105 
   5106 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   5107 {
   5108    return mkU32(Irrm_NEAREST);
   5109 }
   5110 
   5111 
   5112 /* --------- Get/set FP register tag bytes. --------- */
   5113 
   5114 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   5115 
   5116 static void put_ST_TAG ( Int i, IRExpr* value )
   5117 {
   5118    IRRegArray* descr;
   5119    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   5120    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5121    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5122 }
   5123 
   5124 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5125    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5126 
   5127 static IRExpr* get_ST_TAG ( Int i )
   5128 {
   5129    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5130    return IRExpr_GetI( descr, get_ftop(), i );
   5131 }
   5132 
   5133 
   5134 /* --------- Get/set FP registers. --------- */
   5135 
   5136 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5137    register's tag to indicate the register is full.  The previous
   5138    state of the register is not checked. */
   5139 
   5140 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   5141 {
   5142    IRRegArray* descr;
   5143    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   5144    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5145    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5146    /* Mark the register as in-use. */
   5147    put_ST_TAG(i, mkU8(1));
   5148 }
   5149 
   5150 /* Given i, and some expression e, emit
   5151       ST(i) = is_full(i) ? NaN : e
   5152    and set the tag accordingly.
   5153 */
   5154 
   5155 static void put_ST ( Int i, IRExpr* value )
   5156 {
   5157    put_ST_UNCHECKED(
   5158       i,
   5159       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5160                   /* non-0 means full */
   5161                   mkQNaN64(),
   5162                   /* 0 means empty */
   5163                   value
   5164       )
   5165    );
   5166 }
   5167 
   5168 
   5169 /* Given i, generate an expression yielding 'ST(i)'. */
   5170 
   5171 static IRExpr* get_ST_UNCHECKED ( Int i )
   5172 {
   5173    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5174    return IRExpr_GetI( descr, get_ftop(), i );
   5175 }
   5176 
   5177 
   5178 /* Given i, generate an expression yielding
   5179   is_full(i) ? ST(i) : NaN
   5180 */
   5181 
   5182 static IRExpr* get_ST ( Int i )
   5183 {
   5184    return
   5185       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5186                   /* non-0 means full */
   5187                   get_ST_UNCHECKED(i),
   5188                   /* 0 means empty */
   5189                   mkQNaN64());
   5190 }
   5191 
   5192 
   5193 /* Given i, and some expression e, and a condition cond, generate IR
   5194    which has the same effect as put_ST(i,e) when cond is true and has
   5195    no effect when cond is false.  Given the lack of proper
   5196    if-then-else in the IR, this is pretty tricky.
   5197 */
   5198 
   5199 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   5200 {
   5201    // new_tag = if cond then FULL else old_tag
   5202    // new_val = if cond then (if old_tag==FULL then NaN else val)
   5203    //                   else old_val
   5204 
   5205    IRTemp old_tag = newTemp(Ity_I8);
   5206    assign(old_tag, get_ST_TAG(i));
   5207    IRTemp new_tag = newTemp(Ity_I8);
   5208    assign(new_tag,
   5209           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   5210 
   5211    IRTemp old_val = newTemp(Ity_F64);
   5212    assign(old_val, get_ST_UNCHECKED(i));
   5213    IRTemp new_val = newTemp(Ity_F64);
   5214    assign(new_val,
   5215           IRExpr_ITE(mkexpr(cond),
   5216                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   5217                                 /* non-0 means full */
   5218                                 mkQNaN64(),
   5219                                 /* 0 means empty */
   5220                                 value),
   5221                      mkexpr(old_val)));
   5222 
   5223    put_ST_UNCHECKED(i, mkexpr(new_val));
   5224    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   5225    // now set it to new_tag instead.
   5226    put_ST_TAG(i, mkexpr(new_tag));
   5227 }
   5228 
   5229 /* Adjust FTOP downwards by one register. */
   5230 
   5231 static void fp_push ( void )
   5232 {
   5233    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   5234 }
   5235 
   5236 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5237    don't change it. */
   5238 
   5239 static void maybe_fp_push ( IRTemp cond )
   5240 {
   5241    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   5242 }
   5243 
   5244 /* Adjust FTOP upwards by one register, and mark the vacated register
   5245    as empty.  */
   5246 
   5247 static void fp_pop ( void )
   5248 {
   5249    put_ST_TAG(0, mkU8(0));
   5250    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5251 }
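
         /* Editor's note on the stack model used by the helpers above:
            the x87 stack is eight F64 slots addressed circularly, with
            ST(i) living at physical slot (FTOP + i) & 7 -- that is
            what the GetI/PutI pairs keyed off get_ftop() express.  So
            fp_push creates a new ST(0) by decrementing FTOP, and
            fp_pop marks the old ST(0) empty and increments FTOP. */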
   5252 
   5253 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   5254    e[31:1] == 0.
   5255 */
   5256 static void set_C2 ( IRExpr* e )
   5257 {
   5258    IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   5259    put_C3210( binop(Iop_Or64,
   5260                     cleared,
   5261                     binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
   5262 }
   5263 
   5264 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5265    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5266    test is simple, but the derivation of it is not so simple.
   5267 
   5268    The exponent field for an IEEE754 double is 11 bits.  That means it
   5269    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5270    the number is either a NaN or an Infinity and so is not finite.
    5271    Furthermore, a finite value of exactly 2^63 is the smallest value
    5272    that has exponent value 0x43E (= 1086 = 1023 + 63, ie bias + 63).
    5273    Hence, what we need to do is extract the exponent, ignoring the
    5274    sign bit and mantissa, and check it is < 0x43E, or <= 0x43D.
   5275 
   5276    To make this easily applicable to 32- and 64-bit targets, a
   5277    roundabout approach is used.  First the number is converted to I64,
   5278    then the top 32 bits are taken.  Shifting them right by 20 bits
   5279    places the sign bit and exponent in the bottom 12 bits.  Anding
   5280    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5281    available for comparison.
   5282 */
   5283 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   5284 {
   5285    IRTemp i64 = newTemp(Ity_I64);
   5286    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   5287    IRTemp exponent = newTemp(Ity_I32);
   5288    assign(exponent,
   5289           binop(Iop_And32,
   5290                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   5291                 mkU32(0x7FF)));
   5292    IRTemp in_range_and_finite = newTemp(Ity_I1);
   5293    assign(in_range_and_finite,
   5294           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   5295    return in_range_and_finite;
   5296 }
   5297 
   5298 /* Invent a plausible-looking FPU status word value:
   5299       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5300  */
   5301 static IRExpr* get_FPU_sw ( void )
   5302 {
   5303    return
   5304       unop(Iop_32to16,
   5305            binop(Iop_Or32,
   5306                  binop(Iop_Shl32,
   5307                        binop(Iop_And32, get_ftop(), mkU32(7)),
   5308                              mkU8(11)),
   5309                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   5310                                         mkU32(0x4700))
   5311       ));
   5312 }
   5313 
   5314 
   5315 /* Generate a dirty helper call that initialises the x87 state a la
   5316    FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
   5317    |guard| is used as a guarding condition.
   5318 */
   5319 static void gen_FINIT_SEQUENCE ( IRExpr* guard )
   5320 {
   5321    /* Uses dirty helper:
   5322          void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   5323    IRDirty* d  = unsafeIRDirty_0_N (
   5324                     0/*regparms*/,
   5325                     "amd64g_dirtyhelper_FINIT",
   5326                     &amd64g_dirtyhelper_FINIT,
   5327                     mkIRExprVec_1( IRExpr_GSPTR() )
   5328                  );
   5329 
   5330    /* declare we're writing guest state */
   5331    d->nFxState = 5;
   5332    vex_bzero(&d->fxState, sizeof(d->fxState));
   5333 
   5334    d->fxState[0].fx     = Ifx_Write;
   5335    d->fxState[0].offset = OFFB_FTOP;
   5336    d->fxState[0].size   = sizeof(UInt);
   5337 
   5338    d->fxState[1].fx     = Ifx_Write;
   5339    d->fxState[1].offset = OFFB_FPREGS;
   5340    d->fxState[1].size   = 8 * sizeof(ULong);
   5341 
   5342    d->fxState[2].fx     = Ifx_Write;
   5343    d->fxState[2].offset = OFFB_FPTAGS;
   5344    d->fxState[2].size   = 8 * sizeof(UChar);
   5345 
   5346    d->fxState[3].fx     = Ifx_Write;
   5347    d->fxState[3].offset = OFFB_FPROUND;
   5348    d->fxState[3].size   = sizeof(ULong);
   5349 
   5350    d->fxState[4].fx     = Ifx_Write;
   5351    d->fxState[4].offset = OFFB_FC3210;
   5352    d->fxState[4].size   = sizeof(ULong);
   5353 
   5354    if (guard)
   5355       d->guard = guard;
   5356 
   5357    stmt( IRStmt_Dirty(d) );
   5358 }
   5359 
   5360 
   5361 /* ------------------------------------------------------- */
   5362 /* Given all that stack-mangling junk, we can now go ahead
   5363    and describe FP instructions.
   5364 */
   5365 
   5366 /* ST(0) = ST(0) `op` mem64/32(addr)
   5367    Need to check ST(0)'s tag on read, but not on write.
   5368 */
   5369 static
   5370 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5371                          IROp op, Bool dbl )
   5372 {
   5373    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5374    if (dbl) {
   5375       put_ST_UNCHECKED(0,
   5376          triop( op,
   5377                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5378                 get_ST(0),
   5379                 loadLE(Ity_F64,mkexpr(addr))
   5380          ));
   5381    } else {
   5382       put_ST_UNCHECKED(0,
   5383          triop( op,
   5384                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5385                 get_ST(0),
   5386                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5387          ));
   5388    }
   5389 }
   5390 
   5391 
   5392 /* ST(0) = mem64/32(addr) `op` ST(0)
   5393    Need to check ST(0)'s tag on read, but not on write.
   5394 */
   5395 static
   5396 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5397                             IROp op, Bool dbl )
   5398 {
   5399    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5400    if (dbl) {
   5401       put_ST_UNCHECKED(0,
   5402          triop( op,
   5403                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5404                 loadLE(Ity_F64,mkexpr(addr)),
   5405                 get_ST(0)
   5406          ));
   5407    } else {
   5408       put_ST_UNCHECKED(0,
   5409          triop( op,
   5410                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5411                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5412                 get_ST(0)
   5413          ));
   5414    }
   5415 }
   5416 
   5417 
   5418 /* ST(dst) = ST(dst) `op` ST(src).
   5419    Check dst and src tags when reading but not on write.
   5420 */
   5421 static
   5422 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5423                       Bool pop_after )
   5424 {
   5425    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5426    put_ST_UNCHECKED(
   5427       st_dst,
   5428       triop( op,
   5429              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5430              get_ST(st_dst),
   5431              get_ST(st_src) )
   5432    );
   5433    if (pop_after)
   5434       fp_pop();
   5435 }
   5436 
   5437 /* ST(dst) = ST(src) `op` ST(dst).
   5438    Check dst and src tags when reading but not on write.
   5439 */
   5440 static
   5441 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5442                          Bool pop_after )
   5443 {
   5444    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5445    put_ST_UNCHECKED(
   5446       st_dst,
   5447       triop( op,
   5448              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5449              get_ST(st_src),
   5450              get_ST(st_dst) )
   5451    );
   5452    if (pop_after)
   5453       fp_pop();
   5454 }
   5455 
   5456 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   5457 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   5458 {
   5459    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   5460    /* This is a bit of a hack (and isn't really right).  It sets
   5461       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   5462       documentation implies A and S are unchanged.
   5463    */
   5464    /* It's also fishy in that it is used both for COMIP and
   5465       UCOMIP, and they aren't the same (although similar). */
   5466    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   5467    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   5468    stmt( IRStmt_Put(
   5469             OFFB_CC_DEP1,
   5470             binop( Iop_And64,
   5471                    unop( Iop_32Uto64,
   5472                          binop(Iop_CmpF64, get_ST(0), get_ST(i))),
   5473                    mkU64(0x45)
   5474         )));
   5475    if (pop_after)
   5476       fp_pop();
   5477 }
   5478 
   5479 
   5480 /* returns
   5481    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5482 */
   5483 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   5484 {
   5485    IRTemp t32 = newTemp(Ity_I32);
   5486    assign( t32, e32 );
   5487    return
   5488       IRExpr_ITE(
   5489          binop(Iop_CmpLT64U,
   5490                unop(Iop_32Uto64,
   5491                     binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5492                mkU64(65536)),
   5493          unop(Iop_32to16, mkexpr(t32)),
   5494          mkU16( 0x8000 ) );
   5495 }
   5496 
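         /* Editor's note on the trick above: e32 lies in
            [-32768 .. 32767] exactly when the unsigned value
            (e32 + 32768) is below 65536, so one unsigned compare
            replaces two signed ones; out-of-range inputs produce
            0x8000, the x87 "integer indefinite" value. */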
   5497 
   5498 static
   5499 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5500                 const VexAbiInfo* vbi, Prefix pfx, Long delta )
   5501 {
   5502    Int    len;
   5503    UInt   r_src, r_dst;
   5504    HChar  dis_buf[50];
   5505    IRTemp t1, t2;
   5506 
   5507    /* On entry, delta points at the second byte of the insn (the modrm
   5508       byte).*/
   5509    UChar first_opcode = getUChar(delta-1);
   5510    UChar modrm        = getUChar(delta+0);
   5511 
   5512    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5513 
   5514    if (first_opcode == 0xD8) {
   5515       if (modrm < 0xC0) {
   5516 
   5517          /* bits 5,4,3 are an opcode extension, and the modRM also
    5518             specifies an address. */
   5519          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5520          delta += len;
   5521 
   5522          switch (gregLO3ofRM(modrm)) {
   5523 
   5524             case 0: /* FADD single-real */
   5525                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5526                break;
   5527 
   5528             case 1: /* FMUL single-real */
   5529                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5530                break;
   5531 
   5532             case 2: /* FCOM single-real */
   5533                DIP("fcoms %s\n", dis_buf);
    5534                /* This forces C1 to zero.  The AMD documentation
    5535                   suggests that forcing C1 to zero is in fact
    5536                   correct (Eliot Moss). */
   5537                put_C3210(
   5538                    unop( Iop_32Uto64,
   5539                        binop( Iop_And32,
   5540                               binop(Iop_Shl32,
   5541                                     binop(Iop_CmpF64,
   5542                                           get_ST(0),
   5543                                           unop(Iop_F32toF64,
   5544                                                loadLE(Ity_F32,mkexpr(addr)))),
   5545                                     mkU8(8)),
   5546                               mkU32(0x4500)
   5547                    )));
   5548                break;
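                        /* Editor's note on the 0x4500 encoding used in
                           the FCOM-family cases here: Iop_CmpF64 yields
                           0x00 (GT), 0x01 (LT), 0x40 (EQ) or 0x45
                           (unordered); shifting left by 8 and masking
                           with 0x4500 lands those on the status word's
                           C0 (bit 8), C2 (bit 10) and C3 (bit 14) bits,
                           which is the documented FCOM result
                           encoding. */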
   5549 
   5550             case 3: /* FCOMP single-real */
    5551                DIP("fcomps %s\n", dis_buf);
    5552                /* This forces C1 to zero.  The AMD documentation
    5553                   suggests that forcing C1 to zero is in fact
    5554                   correct (Eliot Moss). */
   5555                put_C3210(
   5556                    unop( Iop_32Uto64,
   5557                        binop( Iop_And32,
   5558                               binop(Iop_Shl32,
   5559                                     binop(Iop_CmpF64,
   5560                                           get_ST(0),
   5561                                           unop(Iop_F32toF64,
   5562                                                loadLE(Ity_F32,mkexpr(addr)))),
   5563                                     mkU8(8)),
   5564                               mkU32(0x4500)
   5565                    )));
   5566                fp_pop();
   5567                break;
   5568 
   5569             case 4: /* FSUB single-real */
   5570                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5571                break;
   5572 
   5573             case 5: /* FSUBR single-real */
   5574                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5575                break;
   5576 
   5577             case 6: /* FDIV single-real */
   5578                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5579                break;
   5580 
   5581             case 7: /* FDIVR single-real */
   5582                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5583                break;
   5584 
   5585             default:
   5586                vex_printf("unhandled opc_aux = 0x%02x\n",
   5587                           (UInt)gregLO3ofRM(modrm));
   5588                vex_printf("first_opcode == 0xD8\n");
   5589                goto decode_fail;
   5590          }
   5591       } else {
   5592          delta++;
   5593          switch (modrm) {
   5594 
   5595             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5596                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5597                break;
   5598 
   5599             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5600                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5601                break;
   5602 
   5603             /* Unverified: the operand order may be the wrong way round. */
   5604             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5605                r_dst = (UInt)modrm - 0xD0;
   5606                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
   5607                /* This forces C1 to zero, which isn't right. */
   5608                put_C3210(
   5609                    unop(Iop_32Uto64,
   5610                    binop( Iop_And32,
   5611                           binop(Iop_Shl32,
   5612                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5613                                 mkU8(8)),
   5614                           mkU32(0x4500)
   5615                    )));
   5616                break;
   5617 
   5618             /* Unverified: the operand order may be the wrong way round. */
   5619             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5620                r_dst = (UInt)modrm - 0xD8;
   5621                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
   5622                /* This forces C1 to zero, which isn't right. */
   5623                put_C3210(
   5624                    unop(Iop_32Uto64,
   5625                    binop( Iop_And32,
   5626                           binop(Iop_Shl32,
   5627                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5628                                 mkU8(8)),
   5629                           mkU32(0x4500)
   5630                    )));
   5631                fp_pop();
   5632                break;
   5633 
   5634             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5635                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5636                break;
   5637 
   5638             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5639                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5640                break;
   5641 
   5642             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5643                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5644                break;
   5645 
   5646             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5647                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5648                break;
   5649 
   5650             default:
   5651                goto decode_fail;
   5652          }
   5653       }
   5654    }
   5655 
   5656    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5657    else
   5658    if (first_opcode == 0xD9) {
   5659       if (modrm < 0xC0) {
   5660 
   5661          /* bits 5,4,3 are an opcode extension, and the modRM also
   5662             specifies an address. */
   5663          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5664          delta += len;
   5665 
   5666          switch (gregLO3ofRM(modrm)) {
   5667 
   5668             case 0: /* FLD single-real */
   5669                DIP("flds %s\n", dis_buf);
   5670                fp_push();
   5671                put_ST(0, unop(Iop_F32toF64,
   5672                               loadLE(Ity_F32, mkexpr(addr))));
   5673                break;
   5674 
   5675             case 2: /* FST single-real */
   5676                DIP("fsts %s\n", dis_buf);
   5677                storeLE(mkexpr(addr),
   5678                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5679                break;
   5680 
   5681             case 3: /* FSTP single-real */
   5682                DIP("fstps %s\n", dis_buf);
   5683                storeLE(mkexpr(addr),
   5684                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5685                fp_pop();
   5686                break;
   5687 
   5688             case 4: { /* FLDENV m28 */
   5689                /* Uses dirty helper:
   5690                      VexEmNote amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
   5691                IRTemp    ew = newTemp(Ity_I32);
   5692                IRTemp   w64 = newTemp(Ity_I64);
   5693                IRDirty*   d = unsafeIRDirty_0_N (
   5694                                  0/*regparms*/,
   5695                                  "amd64g_dirtyhelper_FLDENV",
   5696                                  &amd64g_dirtyhelper_FLDENV,
   5697                                  mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5698                               );
   5699                d->tmp       = w64;
   5700                /* declare we're reading memory */
   5701                d->mFx   = Ifx_Read;
   5702                d->mAddr = mkexpr(addr);
   5703                d->mSize = 28;
   5704 
   5705                /* declare we're writing guest state */
   5706                d->nFxState = 4;
   5707                vex_bzero(&d->fxState, sizeof(d->fxState));
   5708 
   5709                d->fxState[0].fx     = Ifx_Write;
   5710                d->fxState[0].offset = OFFB_FTOP;
   5711                d->fxState[0].size   = sizeof(UInt);
   5712 
   5713                d->fxState[1].fx     = Ifx_Write;
   5714                d->fxState[1].offset = OFFB_FPTAGS;
   5715                d->fxState[1].size   = 8 * sizeof(UChar);
   5716 
   5717                d->fxState[2].fx     = Ifx_Write;
   5718                d->fxState[2].offset = OFFB_FPROUND;
   5719                d->fxState[2].size   = sizeof(ULong);
   5720 
   5721                d->fxState[3].fx     = Ifx_Write;
   5722                d->fxState[3].offset = OFFB_FC3210;
   5723                d->fxState[3].size   = sizeof(ULong);
   5724 
   5725                stmt( IRStmt_Dirty(d) );
   5726 
   5727                /* ew contains any emulation warning we may need to
   5728                   issue.  If needed, side-exit to the next insn,
   5729                   reporting the warning, so that Valgrind's dispatcher
   5730                   sees the warning. */
   5731                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5732                put_emwarn( mkexpr(ew) );
   5733                stmt(
   5734                   IRStmt_Exit(
   5735                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5736                      Ijk_EmWarn,
   5737                      IRConst_U64( guest_RIP_bbstart+delta ),
   5738                      OFFB_RIP
   5739                   )
   5740                );
   5741 
   5742                DIP("fldenv %s\n", dis_buf);
   5743                break;
   5744             }
   5745 
   5746             case 5: { /* FLDCW */
   5747                /* The only thing we observe in the control word is the
   5748                   rounding mode.  Therefore, pass the 16-bit value
   5749                   (x87 native-format control word) to a clean helper,
   5750                   getting back a 64-bit value, the lower half of which
   5751                   is the FPROUND value to store, and the upper half of
   5752                   which is the emulation-warning token which may be
   5753                   generated.
   5754                */
   5755                /* ULong amd64g_check_fldcw ( ULong ); */
   5756                IRTemp t64 = newTemp(Ity_I64);
   5757                IRTemp ew = newTemp(Ity_I32);
   5758                DIP("fldcw %s\n", dis_buf);
   5759                assign( t64, mkIRExprCCall(
   5760                                Ity_I64, 0/*regparms*/,
   5761                                "amd64g_check_fldcw",
   5762                                &amd64g_check_fldcw,
   5763                                mkIRExprVec_1(
   5764                                   unop( Iop_16Uto64,
   5765                                         loadLE(Ity_I16, mkexpr(addr)))
   5766                                )
   5767                             )
   5768                      );
   5769 
   5770                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5771                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5772                put_emwarn( mkexpr(ew) );
   5773                /* Finally, if an emulation warning was reported,
   5774                   side-exit to the next insn, reporting the warning,
   5775                   so that Valgrind's dispatcher sees the warning. */
   5776                stmt(
   5777                   IRStmt_Exit(
   5778                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5779                      Ijk_EmWarn,
   5780                      IRConst_U64( guest_RIP_bbstart+delta ),
   5781                      OFFB_RIP
   5782                   )
   5783                );
   5784                break;
   5785             }
   5786 
   5787             case 6: { /* FNSTENV m28 */
   5788                /* Uses dirty helper:
   5789                      void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
   5790                IRDirty* d = unsafeIRDirty_0_N (
   5791                                0/*regparms*/,
   5792                                "amd64g_dirtyhelper_FSTENV",
   5793                                &amd64g_dirtyhelper_FSTENV,
   5794                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5795                             );
   5796                /* declare we're writing memory */
   5797                d->mFx   = Ifx_Write;
   5798                d->mAddr = mkexpr(addr);
   5799                d->mSize = 28;
   5800 
   5801                /* declare we're reading guest state */
   5802                d->nFxState = 4;
   5803                vex_bzero(&d->fxState, sizeof(d->fxState));
   5804 
   5805                d->fxState[0].fx     = Ifx_Read;
   5806                d->fxState[0].offset = OFFB_FTOP;
   5807                d->fxState[0].size   = sizeof(UInt);
   5808 
   5809                d->fxState[1].fx     = Ifx_Read;
   5810                d->fxState[1].offset = OFFB_FPTAGS;
   5811                d->fxState[1].size   = 8 * sizeof(UChar);
   5812 
   5813                d->fxState[2].fx     = Ifx_Read;
   5814                d->fxState[2].offset = OFFB_FPROUND;
   5815                d->fxState[2].size   = sizeof(ULong);
   5816 
   5817                d->fxState[3].fx     = Ifx_Read;
   5818                d->fxState[3].offset = OFFB_FC3210;
   5819                d->fxState[3].size   = sizeof(ULong);
   5820 
   5821                stmt( IRStmt_Dirty(d) );
   5822 
   5823                DIP("fnstenv %s\n", dis_buf);
   5824                break;
   5825             }
   5826 
   5827             case 7: /* FNSTCW */
   5828                /* Fake up a native x87 FPU control word.  The only
   5829                   thing it depends on is FPROUND[1:0], so call a clean
   5830                   helper to cook it up. */
   5831                /* ULong amd64g_create_fpucw ( ULong fpround ) */
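                       /* (Presumably this amounts to planting FPROUND into
                          the RC field, bits 11:10 of the native control
                          word, with everything else left at its
                          masked/default setting -- see the definition of
                          amd64g_create_fpucw for the authoritative
                          layout.) */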
   5832                DIP("fnstcw %s\n", dis_buf);
   5833                storeLE(
   5834                   mkexpr(addr),
   5835                   unop( Iop_64to16,
   5836                         mkIRExprCCall(
   5837                            Ity_I64, 0/*regp*/,
   5838                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5839                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5840                         )
   5841                   )
   5842                );
   5843                break;
   5844 
   5845             default:
   5846                vex_printf("unhandled opc_aux = 0x%02x\n",
   5847                           (UInt)gregLO3ofRM(modrm));
   5848                vex_printf("first_opcode == 0xD9\n");
   5849                goto decode_fail;
   5850          }
   5851 
   5852       } else {
   5853          delta++;
   5854          switch (modrm) {
   5855 
   5856             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5857                r_src = (UInt)modrm - 0xC0;
   5858                DIP("fld %%st(%u)\n", r_src);
   5859                t1 = newTemp(Ity_F64);
   5860                assign(t1, get_ST(r_src));
   5861                fp_push();
   5862                put_ST(0, mkexpr(t1));
   5863                break;
   5864 
   5865             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5866                r_src = (UInt)modrm - 0xC8;
   5867                DIP("fxch %%st(%u)\n", r_src);
   5868                t1 = newTemp(Ity_F64);
   5869                t2 = newTemp(Ity_F64);
   5870                assign(t1, get_ST(0));
   5871                assign(t2, get_ST(r_src));
   5872                put_ST_UNCHECKED(0, mkexpr(t2));
   5873                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5874                break;
   5875 
   5876             case 0xE0: /* FCHS */
   5877                DIP("fchs\n");
   5878                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5879                break;
   5880 
   5881             case 0xE1: /* FABS */
   5882                DIP("fabs\n");
   5883                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5884                break;
   5885 
   5886             case 0xE5: { /* FXAM */
   5887                /* This is an interesting one.  It examines %st(0),
   5888                   regardless of whether the tag says it's empty or not.
   5889                   Here, just pass both the tag (in our format) and the
   5890                   value (as a double, actually a ULong) to a helper
   5891                   function. */
   5892                IRExpr** args
   5893                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5894                                    unop(Iop_ReinterpF64asI64,
   5895                                         get_ST_UNCHECKED(0)) );
   5896                put_C3210(mkIRExprCCall(
   5897                             Ity_I64,
   5898                             0/*regparm*/,
   5899                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5900                             args
   5901                         ));
   5902                DIP("fxam\n");
   5903                break;
   5904             }
   5905 
   5906             case 0xE8: /* FLD1 */
   5907                DIP("fld1\n");
   5908                fp_push();
   5909                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5910                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5911                break;
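                       /* Note: FLD1 and the FLDx cases below plant exact
                          IEEE754 bit patterns (IRConst_F64i) rather than C
                          floating literals -- presumably so the loaded
                          values cannot depend on the host compiler's
                          decimal-to-binary rounding.  For example,
                          0x3ff0000000000000 is exactly 1.0. */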
   5912 
   5913             case 0xE9: /* FLDL2T */
   5914                DIP("fldl2t\n");
   5915                fp_push();
   5916                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5917                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5918                break;
   5919 
   5920             case 0xEA: /* FLDL2E */
   5921                DIP("fldl2e\n");
   5922                fp_push();
   5923                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5924                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5925                break;
   5926 
   5927             case 0xEB: /* FLDPI */
   5928                DIP("fldpi\n");
   5929                fp_push();
   5930                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5931                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5932                break;
   5933 
   5934             case 0xEC: /* FLDLG2 */
   5935                DIP("fldlg2\n");
   5936                fp_push();
   5937                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5938                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5939                break;
   5940 
   5941             case 0xED: /* FLDLN2 */
   5942                DIP("fldln2\n");
   5943                fp_push();
   5944                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5945                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5946                break;
   5947 
   5948             case 0xEE: /* FLDZ */
   5949                DIP("fldz\n");
   5950                fp_push();
   5951                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5952                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5953                break;
   5954 
   5955             case 0xF0: /* F2XM1 */
   5956                DIP("f2xm1\n");
   5957                put_ST_UNCHECKED(0,
   5958                   binop(Iop_2xm1F64,
   5959                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5960                         get_ST(0)));
   5961                break;
   5962 
   5963             case 0xF1: /* FYL2X */
   5964                DIP("fyl2x\n");
   5965                put_ST_UNCHECKED(1,
   5966                   triop(Iop_Yl2xF64,
   5967                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5968                         get_ST(1),
   5969                         get_ST(0)));
   5970                fp_pop();
   5971                break;
   5972 
   5973             case 0xF2: { /* FPTAN */
   5974                DIP("fptan\n");
   5975                IRTemp argD = newTemp(Ity_F64);
   5976                assign(argD, get_ST(0));
   5977                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5978                IRTemp resD = newTemp(Ity_F64);
   5979                assign(resD,
   5980                   IRExpr_ITE(
   5981                      mkexpr(argOK),
   5982                      binop(Iop_TanF64,
   5983                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5984                            mkexpr(argD)),
   5985                      mkexpr(argD))
   5986                );
   5987                put_ST_UNCHECKED(0, mkexpr(resD));
   5988                /* Conditionally push 1.0 on the stack, if the arg is
   5989                   in range */
   5990                maybe_fp_push(argOK);
   5991                maybe_put_ST(argOK, 0,
   5992                             IRExpr_Const(IRConst_F64(1.0)));
   5993                set_C2( binop(Iop_Xor64,
   5994                              unop(Iop_1Uto64, mkexpr(argOK)),
   5995                              mkU64(1)) );
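                       /* The Xor with 1 inverts argOK, so C2 ends up set
                          exactly when the argument was out of range for
                          the trig op -- the hardware's way of signalling
                          an unreduced operand.  FSINCOS, FSIN and FCOS
                          below use the same scheme. */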
   5996                break;
   5997             }
   5998 
   5999             case 0xF3: /* FPATAN */
   6000                DIP("fpatan\n");
   6001                put_ST_UNCHECKED(1,
   6002                   triop(Iop_AtanF64,
   6003                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6004                         get_ST(1),
   6005                         get_ST(0)));
   6006                fp_pop();
   6007                break;
   6008 
   6009             case 0xF4: { /* FXTRACT */
   6010                IRTemp argF = newTemp(Ity_F64);
   6011                IRTemp sigF = newTemp(Ity_F64);
   6012                IRTemp expF = newTemp(Ity_F64);
   6013                IRTemp argI = newTemp(Ity_I64);
   6014                IRTemp sigI = newTemp(Ity_I64);
   6015                IRTemp expI = newTemp(Ity_I64);
   6016                DIP("fxtract\n");
   6017                assign( argF, get_ST(0) );
   6018                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   6019                assign( sigI,
   6020                        mkIRExprCCall(
   6021                           Ity_I64, 0/*regparms*/,
   6022                           "x86amd64g_calculate_FXTRACT",
   6023                           &x86amd64g_calculate_FXTRACT,
   6024                           mkIRExprVec_2( mkexpr(argI),
   6025                                          mkIRExpr_HWord(0)/*sig*/ ))
   6026                );
   6027                assign( expI,
   6028                        mkIRExprCCall(
   6029                           Ity_I64, 0/*regparms*/,
   6030                           "x86amd64g_calculate_FXTRACT",
   6031                           &x86amd64g_calculate_FXTRACT,
   6032                           mkIRExprVec_2( mkexpr(argI),
   6033                                          mkIRExpr_HWord(1)/*exp*/ ))
   6034                );
   6035                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   6036                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   6037                /* exponent */
   6038                put_ST_UNCHECKED(0, mkexpr(expF) );
   6039                fp_push();
   6040                /* significand */
   6041                put_ST(0, mkexpr(sigF) );
   6042                break;
   6043             }
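                    /* Net effect: ST(1) holds the exponent and ST(0) the
                       significand (both as F64s), such that the original
                       ST(0) == significand * 2^exponent. */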
   6044 
   6045             case 0xF5: { /* FPREM1 -- IEEE compliant */
   6046                IRTemp a1 = newTemp(Ity_F64);
   6047                IRTemp a2 = newTemp(Ity_F64);
   6048                DIP("fprem1\n");
   6049                /* Do FPREM1 twice, once to get the remainder, and once
   6050                   to get the C3210 flag values. */
   6051                assign( a1, get_ST(0) );
   6052                assign( a2, get_ST(1) );
   6053                put_ST_UNCHECKED(0,
   6054                   triop(Iop_PRem1F64,
   6055                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6056                         mkexpr(a1),
   6057                         mkexpr(a2)));
   6058                put_C3210(
   6059                   unop(Iop_32Uto64,
   6060                   triop(Iop_PRem1C3210F64,
   6061                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6062                         mkexpr(a1),
   6063                         mkexpr(a2)) ));
   6064                break;
   6065             }
   6066 
   6067             case 0xF7: /* FINCSTP */
   6068                DIP("fincstp\n");
   6069                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   6070                break;
   6071 
   6072             case 0xF8: { /* FPREM -- not IEEE compliant */
   6073                IRTemp a1 = newTemp(Ity_F64);
   6074                IRTemp a2 = newTemp(Ity_F64);
   6075                DIP("fprem\n");
   6076                /* Do FPREM twice, once to get the remainder, and once
   6077                   to get the C3210 flag values. */
   6078                assign( a1, get_ST(0) );
   6079                assign( a2, get_ST(1) );
   6080                put_ST_UNCHECKED(0,
   6081                   triop(Iop_PRemF64,
   6082                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6083                         mkexpr(a1),
   6084                         mkexpr(a2)));
   6085                put_C3210(
   6086                   unop(Iop_32Uto64,
   6087                   triop(Iop_PRemC3210F64,
   6088                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6089                         mkexpr(a1),
   6090                         mkexpr(a2)) ));
   6091                break;
   6092             }
   6093 
   6094             case 0xF9: /* FYL2XP1 */
   6095                DIP("fyl2xp1\n");
   6096                put_ST_UNCHECKED(1,
   6097                   triop(Iop_Yl2xp1F64,
   6098                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6099                         get_ST(1),
   6100                         get_ST(0)));
   6101                fp_pop();
   6102                break;
   6103 
   6104             case 0xFA: /* FSQRT */
   6105                DIP("fsqrt\n");
   6106                put_ST_UNCHECKED(0,
   6107                   binop(Iop_SqrtF64,
   6108                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6109                         get_ST(0)));
   6110                break;
   6111 
   6112             case 0xFB: { /* FSINCOS */
   6113                DIP("fsincos\n");
   6114                IRTemp argD = newTemp(Ity_F64);
   6115                assign(argD, get_ST(0));
   6116                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6117                IRTemp resD = newTemp(Ity_F64);
   6118                assign(resD,
   6119                   IRExpr_ITE(
   6120                      mkexpr(argOK),
   6121                      binop(Iop_SinF64,
   6122                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6123                            mkexpr(argD)),
   6124                      mkexpr(argD))
   6125                );
   6126                put_ST_UNCHECKED(0, mkexpr(resD));
   6127                /* Conditionally push the cos value on the stack, if
   6128                   the arg is in range */
   6129                maybe_fp_push(argOK);
   6130                maybe_put_ST(argOK, 0,
   6131                   binop(Iop_CosF64,
   6132                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6133                         mkexpr(argD)));
   6134                set_C2( binop(Iop_Xor64,
   6135                              unop(Iop_1Uto64, mkexpr(argOK)),
   6136                              mkU64(1)) );
   6137                break;
   6138             }
   6139 
   6140             case 0xFC: /* FRNDINT */
   6141                DIP("frndint\n");
   6142                put_ST_UNCHECKED(0,
   6143                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   6144                break;
   6145 
   6146             case 0xFD: /* FSCALE */
   6147                DIP("fscale\n");
   6148                put_ST_UNCHECKED(0,
   6149                   triop(Iop_ScaleF64,
   6150                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6151                         get_ST(0),
   6152                         get_ST(1)));
   6153                break;
   6154 
   6155             case 0xFE:   /* FSIN */
   6156             case 0xFF: { /* FCOS */
   6157                Bool isSIN = modrm == 0xFE;
   6158                DIP("%s\n", isSIN ? "fsin" : "fcos");
   6159                IRTemp argD = newTemp(Ity_F64);
   6160                assign(argD, get_ST(0));
   6161                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6162                IRTemp resD = newTemp(Ity_F64);
   6163                assign(resD,
   6164                   IRExpr_ITE(
   6165                      mkexpr(argOK),
   6166                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6167                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6168                            mkexpr(argD)),
   6169                      mkexpr(argD))
   6170                );
   6171                put_ST_UNCHECKED(0, mkexpr(resD));
   6172                set_C2( binop(Iop_Xor64,
   6173                              unop(Iop_1Uto64, mkexpr(argOK)),
   6174                              mkU64(1)) );
   6175                break;
   6176             }
   6177 
   6178             default:
   6179                goto decode_fail;
   6180          }
   6181       }
   6182    }
   6183 
   6184    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6185    else
   6186    if (first_opcode == 0xDA) {
   6187 
   6188       if (modrm < 0xC0) {
   6189 
   6190          /* bits 5,4,3 are an opcode extension, and the modRM also
   6191             specifies an address. */
   6192          IROp   fop;
   6193          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6194          delta += len;
   6195          switch (gregLO3ofRM(modrm)) {
   6196 
   6197             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6198                DIP("fiaddl %s\n", dis_buf);
   6199                fop = Iop_AddF64;
   6200                goto do_fop_m32;
   6201 
   6202             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6203                DIP("fimull %s\n", dis_buf);
   6204                fop = Iop_MulF64;
   6205                goto do_fop_m32;
   6206 
   6207             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6208                DIP("fisubl %s\n", dis_buf);
   6209                fop = Iop_SubF64;
   6210                goto do_fop_m32;
   6211 
   6212             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6213                DIP("fisubrl %s\n", dis_buf);
   6214                fop = Iop_SubF64;
   6215                goto do_foprev_m32;
   6216 
   6217             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6218                DIP("fidivl %s\n", dis_buf);
   6219                fop = Iop_DivF64;
   6220                goto do_fop_m32;
   6221 
   6222             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6223                DIP("fidivrl %s\n", dis_buf);
   6224                fop = Iop_DivF64;
   6225                goto do_foprev_m32;
   6226 
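                    /* Shared bodies for the FI<op> m32int cases above: by
                       the time control arrives here, 'fop' is set and
                       'addr' has been decoded.  The memory operand is
                       widened with Iop_I32StoF64 and combined with ST(0)
                       in the stated order (reversed for do_foprev_m32). */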
   6227             do_fop_m32:
   6228                put_ST_UNCHECKED(0,
   6229                   triop(fop,
   6230                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6231                         get_ST(0),
   6232                         unop(Iop_I32StoF64,
   6233                              loadLE(Ity_I32, mkexpr(addr)))));
   6234                break;
   6235 
   6236             do_foprev_m32:
   6237                put_ST_UNCHECKED(0,
   6238                   triop(fop,
   6239                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6240                         unop(Iop_I32StoF64,
   6241                              loadLE(Ity_I32, mkexpr(addr))),
   6242                         get_ST(0)));
   6243                break;
   6244 
   6245             default:
   6246                vex_printf("unhandled opc_aux = 0x%02x\n",
   6247                           (UInt)gregLO3ofRM(modrm));
   6248                vex_printf("first_opcode == 0xDA\n");
   6249                goto decode_fail;
   6250          }
   6251 
   6252       } else {
   6253 
   6254          delta++;
   6255          switch (modrm) {
   6256 
   6257             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6258                r_src = (UInt)modrm - 0xC0;
   6259                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6260                put_ST_UNCHECKED(0,
   6261                                 IRExpr_ITE(
   6262                                     mk_amd64g_calculate_condition(AMD64CondB),
   6263                                     get_ST(r_src), get_ST(0)) );
   6264                break;
   6265 
   6266             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6267                r_src = (UInt)modrm - 0xC8;
   6268                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6269                put_ST_UNCHECKED(0,
   6270                                 IRExpr_ITE(
   6271                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6272                                     get_ST(r_src), get_ST(0)) );
   6273                break;
   6274 
   6275             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6276                r_src = (UInt)modrm - 0xD0;
   6277                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6278                put_ST_UNCHECKED(0,
   6279                                 IRExpr_ITE(
   6280                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6281                                     get_ST(r_src), get_ST(0)) );
   6282                break;
   6283 
   6284             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6285                r_src = (UInt)modrm - 0xD8;
   6286                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6287                put_ST_UNCHECKED(0,
   6288                                 IRExpr_ITE(
   6289                                     mk_amd64g_calculate_condition(AMD64CondP),
   6290                                     get_ST(r_src), get_ST(0)) );
   6291                break;
   6292 
   6293             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6294                DIP("fucompp %%st(0),%%st(1)\n");
   6295                /* This forces C1 to zero, which isn't right. */
   6296                put_C3210(
   6297                    unop(Iop_32Uto64,
   6298                    binop( Iop_And32,
   6299                           binop(Iop_Shl32,
   6300                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6301                                 mkU8(8)),
   6302                           mkU32(0x4500)
   6303                    )));
   6304                fp_pop();
   6305                fp_pop();
   6306                break;
   6307 
   6308             default:
   6309                goto decode_fail;
   6310          }
   6311 
   6312       }
   6313    }
   6314 
   6315    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6316    else
   6317    if (first_opcode == 0xDB) {
   6318       if (modrm < 0xC0) {
   6319 
   6320          /* bits 5,4,3 are an opcode extension, and the modRM also
   6321             specifies an address. */
   6322          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6323          delta += len;
   6324 
   6325          switch (gregLO3ofRM(modrm)) {
   6326 
   6327             case 0: /* FILD m32int */
   6328                DIP("fildl %s\n", dis_buf);
   6329                fp_push();
   6330                put_ST(0, unop(Iop_I32StoF64,
   6331                               loadLE(Ity_I32, mkexpr(addr))));
   6332                break;
   6333 
   6334             case 1: /* FISTTPL m32 (SSE3) */
   6335                DIP("fisttpl %s\n", dis_buf);
   6336                storeLE( mkexpr(addr),
   6337                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6338                fp_pop();
   6339                break;
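                       /* FISTTP always truncates (hence the hardwired
                          Irrm_ZERO), ignoring the rounding mode in the FPU
                          control word -- that is the point of this SSE3
                          variant. */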
   6340 
   6341             case 2: /* FIST m32 */
   6342                DIP("fistl %s\n", dis_buf);
   6343                storeLE( mkexpr(addr),
   6344                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6345                break;
   6346 
   6347             case 3: /* FISTP m32 */
   6348                DIP("fistpl %s\n", dis_buf);
   6349                storeLE( mkexpr(addr),
   6350                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6351                fp_pop();
   6352                break;
   6353 
   6354             case 5: { /* FLD extended-real */
   6355                /* Uses dirty helper:
   6356                      ULong amd64g_loadF80le ( ULong )
   6357                   addr holds the address.  First, do a dirty call to
   6358                   get hold of the data. */
   6359                IRTemp   val  = newTemp(Ity_I64);
   6360                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   6361 
   6362                IRDirty* d = unsafeIRDirty_1_N (
   6363                                val,
   6364                                0/*regparms*/,
   6365                                "amd64g_dirtyhelper_loadF80le",
   6366                                &amd64g_dirtyhelper_loadF80le,
   6367                                args
   6368                             );
   6369                /* declare that we're reading memory */
   6370                d->mFx   = Ifx_Read;
   6371                d->mAddr = mkexpr(addr);
   6372                d->mSize = 10;
   6373 
   6374                /* execute the dirty call, dumping the result in val. */
   6375                stmt( IRStmt_Dirty(d) );
   6376                fp_push();
   6377                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   6378 
   6379                DIP("fldt %s\n", dis_buf);
   6380                break;
   6381             }
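                    /* The IR has no 80-bit FP type, so the helper converts
                       the ten-byte extended value to an F64 (necessarily
                       losing any precision beyond 53 significand bits),
                       and the result is reinterpreted as a double.  The
                       FSTP case below uses a companion helper for the
                       reverse conversion. */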
   6382 
   6383             case 7: { /* FSTP extended-real */
   6384                /* Uses dirty helper:
   6385                      void amd64g_storeF80le ( ULong addr, ULong data )
   6386                */
   6387                IRExpr** args
   6388                   = mkIRExprVec_2( mkexpr(addr),
   6389                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6390 
   6391                IRDirty* d = unsafeIRDirty_0_N (
   6392                                0/*regparms*/,
   6393                                "amd64g_dirtyhelper_storeF80le",
   6394                                &amd64g_dirtyhelper_storeF80le,
   6395                                args
   6396                             );
   6397                /* declare we're writing memory */
   6398                d->mFx   = Ifx_Write;
   6399                d->mAddr = mkexpr(addr);
   6400                d->mSize = 10;
   6401 
   6402                /* execute the dirty call. */
   6403                stmt( IRStmt_Dirty(d) );
   6404                fp_pop();
   6405 
   6406                DIP("fstpt %s\n", dis_buf);
   6407                break;
   6408             }
   6409 
   6410             default:
   6411                vex_printf("unhandled opc_aux = 0x%02x\n",
   6412                           (UInt)gregLO3ofRM(modrm));
   6413                vex_printf("first_opcode == 0xDB\n");
   6414                goto decode_fail;
   6415          }
   6416 
   6417       } else {
   6418 
   6419          delta++;
   6420          switch (modrm) {
   6421 
   6422             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6423                r_src = (UInt)modrm - 0xC0;
   6424                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6425                put_ST_UNCHECKED(0,
   6426                                 IRExpr_ITE(
   6427                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6428                                     get_ST(r_src), get_ST(0)) );
   6429                break;
   6430 
   6431             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6432                r_src = (UInt)modrm - 0xC8;
   6433                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6434                put_ST_UNCHECKED(
   6435                   0,
   6436                   IRExpr_ITE(
   6437                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6438                      get_ST(r_src),
   6439                      get_ST(0)
   6440                   )
   6441                );
   6442                break;
   6443 
   6444             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6445                r_src = (UInt)modrm - 0xD0;
   6446                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6447                put_ST_UNCHECKED(
   6448                   0,
   6449                   IRExpr_ITE(
   6450                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6451                      get_ST(r_src),
   6452                      get_ST(0)
   6453                   )
   6454                );
   6455                break;
   6456 
   6457             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6458                r_src = (UInt)modrm - 0xD8;
   6459                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6460                put_ST_UNCHECKED(
   6461                   0,
   6462                   IRExpr_ITE(
   6463                      mk_amd64g_calculate_condition(AMD64CondNP),
   6464                      get_ST(r_src),
   6465                      get_ST(0)
   6466                   )
   6467                );
   6468                break;
   6469 
   6470             case 0xE2: /* FNCLEX */
   6471                DIP("fnclex\n");
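                       /* No state change: the exception flags which FNCLEX
                          would clear are not modelled in this translation,
                          so there is nothing to do. */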
   6472                break;
   6473 
   6474             case 0xE3: { /* FNINIT */
   6475                gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
   6476                DIP("fninit\n");
   6477                break;
   6478             }
   6479 
   6480             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6481                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6482                break;
   6483 
   6484             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6485                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6486                break;
   6487 
   6488             default:
   6489                goto decode_fail;
   6490          }
   6491       }
   6492    }
   6493 
   6494    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   6495    else
   6496    if (first_opcode == 0xDC) {
   6497       if (modrm < 0xC0) {
   6498 
   6499          /* bits 5,4,3 are an opcode extension, and the modRM also
   6500             specifies an address. */
   6501          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6502          delta += len;
   6503 
   6504          switch (gregLO3ofRM(modrm)) {
   6505 
   6506             case 0: /* FADD double-real */
   6507                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   6508                break;
   6509 
   6510             case 1: /* FMUL double-real */
   6511                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   6512                break;
   6513 
   6514             case 2: /* FCOM double-real */
   6515                DIP("fcoml %s\n", dis_buf);
   6516                /* This forces C1 to zero, which isn't right. */
   6517                put_C3210(
   6518                    unop(Iop_32Uto64,
   6519                    binop( Iop_And32,
   6520                           binop(Iop_Shl32,
   6521                                 binop(Iop_CmpF64,
   6522                                       get_ST(0),
   6523                                       loadLE(Ity_F64,mkexpr(addr))),
   6524                                 mkU8(8)),
   6525                           mkU32(0x4500)
   6526                    )));
   6527                break;
   6528 
   6529             case 3: /* FCOMP double-real */
   6530                DIP("fcompl %s\n", dis_buf);
   6531                /* This forces C1 to zero, which isn't right. */
   6532                put_C3210(
   6533                    unop(Iop_32Uto64,
   6534                    binop( Iop_And32,
   6535                           binop(Iop_Shl32,
   6536                                 binop(Iop_CmpF64,
   6537                                       get_ST(0),
   6538                                       loadLE(Ity_F64,mkexpr(addr))),
   6539                                 mkU8(8)),
   6540                           mkU32(0x4500)
   6541                    )));
   6542                fp_pop();
   6543                break;
   6544 
   6545             case 4: /* FSUB double-real */
   6546                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   6547                break;
   6548 
   6549             case 5: /* FSUBR double-real */
   6550                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   6551                break;
   6552 
   6553             case 6: /* FDIV double-real */
   6554                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   6555                break;
   6556 
   6557             case 7: /* FDIVR double-real */
   6558                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   6559                break;
   6560 
   6561             default:
   6562                vex_printf("unhandled opc_aux = 0x%02x\n",
   6563                           (UInt)gregLO3ofRM(modrm));
   6564                vex_printf("first_opcode == 0xDC\n");
   6565                goto decode_fail;
   6566          }
   6567 
   6568       } else {
   6569 
   6570          delta++;
   6571          switch (modrm) {
   6572 
   6573             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   6574                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   6575                break;
   6576 
   6577             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   6578                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   6579                break;
   6580 
   6581             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   6582                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   6583                break;
   6584 
   6585             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   6586                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   6587                break;
   6588 
   6589             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   6590                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   6591                break;
   6592 
   6593             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   6594                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   6595                break;
   6596 
   6597             default:
   6598                goto decode_fail;
   6599          }
   6600 
   6601       }
   6602    }
   6603 
   6604    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6605    else
   6606    if (first_opcode == 0xDD) {
   6607 
   6608       if (modrm < 0xC0) {
   6609 
   6610          /* bits 5,4,3 are an opcode extension, and the modRM also
   6611             specifies an address. */
   6612          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6613          delta += len;
   6614 
   6615          switch (gregLO3ofRM(modrm)) {
   6616 
   6617             case 0: /* FLD double-real */
   6618                DIP("fldl %s\n", dis_buf);
   6619                fp_push();
   6620                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6621                break;
   6622 
   6623             case 1: /* FISTTPQ m64 (SSE3) */
   6624                DIP("fisttpll %s\n", dis_buf);
   6625                storeLE( mkexpr(addr),
   6626                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6627                fp_pop();
   6628                break;
   6629 
   6630             case 2: /* FST double-real */
   6631                DIP("fstl %s\n", dis_buf);
   6632                storeLE(mkexpr(addr), get_ST(0));
   6633                break;
   6634 
   6635             case 3: /* FSTP double-real */
   6636                DIP("fstpl %s\n", dis_buf);
   6637                storeLE(mkexpr(addr), get_ST(0));
   6638                fp_pop();
   6639                break;
   6640 
   6641             case 4: { /* FRSTOR m94/m108 */
   6642                IRTemp   ew = newTemp(Ity_I32);
   6643                IRTemp  w64 = newTemp(Ity_I64);
   6644                IRDirty*  d;
   6645                if ( have66(pfx) ) {
   6646                   /* Uses dirty helper:
   6647                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6648                                   ( VexGuestAMD64State*, HWord ) */
   6649                   d = unsafeIRDirty_0_N (
   6650                          0/*regparms*/,
   6651                          "amd64g_dirtyhelper_FRSTORS",
   6652                          &amd64g_dirtyhelper_FRSTORS,
   6653                          mkIRExprVec_1( mkexpr(addr) )
   6654                       );
   6655                   d->mSize = 94;
   6656                } else {
   6657                   /* Uses dirty helper:
   6658                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6659                                   ( VexGuestAMD64State*, HWord ) */
   6660                   d = unsafeIRDirty_0_N (
   6661                          0/*regparms*/,
   6662                          "amd64g_dirtyhelper_FRSTOR",
   6663                          &amd64g_dirtyhelper_FRSTOR,
   6664                          mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   6665                       );
   6666                   d->mSize = 108;
   6667                }
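                       /* The two sizes are the 16-bit-operand-size image
                          (14-byte env + 8 * 10 register bytes = 94) and
                          the 32-bit-operand-size image (28-byte env + 80
                          = 108).  FNSAVE below uses the same pair. */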
   6668 
   6669                d->tmp    = w64;
   6670                /* declare we're reading memory */
   6671                d->mFx   = Ifx_Read;
   6672                d->mAddr = mkexpr(addr);
   6673                /* d->mSize set above */
   6674 
   6675                /* declare we're writing guest state */
   6676                d->nFxState = 5;
   6677                vex_bzero(&d->fxState, sizeof(d->fxState));
   6678 
   6679                d->fxState[0].fx     = Ifx_Write;
   6680                d->fxState[0].offset = OFFB_FTOP;
   6681                d->fxState[0].size   = sizeof(UInt);
   6682 
   6683                d->fxState[1].fx     = Ifx_Write;
   6684                d->fxState[1].offset = OFFB_FPREGS;
   6685                d->fxState[1].size   = 8 * sizeof(ULong);
   6686 
   6687                d->fxState[2].fx     = Ifx_Write;
   6688                d->fxState[2].offset = OFFB_FPTAGS;
   6689                d->fxState[2].size   = 8 * sizeof(UChar);
   6690 
   6691                d->fxState[3].fx     = Ifx_Write;
   6692                d->fxState[3].offset = OFFB_FPROUND;
   6693                d->fxState[3].size   = sizeof(ULong);
   6694 
   6695                d->fxState[4].fx     = Ifx_Write;
   6696                d->fxState[4].offset = OFFB_FC3210;
   6697                d->fxState[4].size   = sizeof(ULong);
   6698 
   6699                stmt( IRStmt_Dirty(d) );
   6700 
   6701                /* ew contains any emulation warning we may need to
   6702                   issue.  If needed, side-exit to the next insn,
   6703                   reporting the warning, so that Valgrind's dispatcher
   6704                   sees the warning. */
   6705                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6706                put_emwarn( mkexpr(ew) );
   6707                stmt(
   6708                   IRStmt_Exit(
   6709                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6710                      Ijk_EmWarn,
   6711                      IRConst_U64( guest_RIP_bbstart+delta ),
   6712                      OFFB_RIP
   6713                   )
   6714                );
   6715 
   6716                if ( have66(pfx) ) {
   6717                   DIP("frstors %s\n", dis_buf);
   6718                } else {
   6719                   DIP("frstor %s\n", dis_buf);
   6720                }
   6721                break;
   6722             }
   6723 
   6724             case 6: { /* FNSAVE m94/m108 */
   6725                IRDirty *d;
   6726                if ( have66(pfx) ) {
   6727                  /* Uses dirty helper:
   6728                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6729                                                       HWord ) */
   6730                   d = unsafeIRDirty_0_N (
   6731                          0/*regparms*/,
   6732                          "amd64g_dirtyhelper_FNSAVES",
   6733                          &amd64g_dirtyhelper_FNSAVES,
   6734                          mkIRExprVec_1( mkexpr(addr) )
   6735                       );
   6736                   d->mSize = 94;
   6737                } else {
   6738                  /* Uses dirty helper:
   6739                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6740                                                      HWord ) */
   6741                   d = unsafeIRDirty_0_N (
   6742                          0/*regparms*/,
   6743                          "amd64g_dirtyhelper_FNSAVE",
   6744                          &amd64g_dirtyhelper_FNSAVE,
   6745                          mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   6746                       );
   6747                   d->mSize = 108;
   6748                }
   6749 
   6750                /* declare we're writing memory */
   6751                d->mFx   = Ifx_Write;
   6752                d->mAddr = mkexpr(addr);
   6753                /* d->mSize set above */
   6754 
   6755                /* declare we're reading guest state */
   6756                d->nFxState = 5;
   6757                vex_bzero(&d->fxState, sizeof(d->fxState));
   6758 
   6759                d->fxState[0].fx     = Ifx_Read;
   6760                d->fxState[0].offset = OFFB_FTOP;
   6761                d->fxState[0].size   = sizeof(UInt);
   6762 
   6763                d->fxState[1].fx     = Ifx_Read;
   6764                d->fxState[1].offset = OFFB_FPREGS;
   6765                d->fxState[1].size   = 8 * sizeof(ULong);
   6766 
   6767                d->fxState[2].fx     = Ifx_Read;
   6768                d->fxState[2].offset = OFFB_FPTAGS;
   6769                d->fxState[2].size   = 8 * sizeof(UChar);
   6770 
   6771                d->fxState[3].fx     = Ifx_Read;
   6772                d->fxState[3].offset = OFFB_FPROUND;
   6773                d->fxState[3].size   = sizeof(ULong);
   6774 
   6775                d->fxState[4].fx     = Ifx_Read;
   6776                d->fxState[4].offset = OFFB_FC3210;
   6777                d->fxState[4].size   = sizeof(ULong);
   6778 
   6779                stmt( IRStmt_Dirty(d) );
   6780 
   6781                if ( have66(pfx) ) {
   6782                  DIP("fnsaves %s\n", dis_buf);
   6783                } else {
   6784                  DIP("fnsave %s\n", dis_buf);
   6785                }
   6786                break;
   6787             }
   6788 
   6789             case 7: { /* FNSTSW m16 */
   6790                IRExpr* sw = get_FPU_sw();
   6791                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6792                storeLE( mkexpr(addr), sw );
   6793                DIP("fnstsw %s\n", dis_buf);
   6794                break;
   6795             }
   6796 
   6797             default:
   6798                vex_printf("unhandled opc_aux = 0x%02x\n",
   6799                           (UInt)gregLO3ofRM(modrm));
   6800                vex_printf("first_opcode == 0xDD\n");
   6801                goto decode_fail;
   6802          }
   6803       } else {
   6804          delta++;
   6805          switch (modrm) {
   6806 
   6807             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6808                r_dst = (UInt)modrm - 0xC0;
   6809                DIP("ffree %%st(%u)\n", r_dst);
   6810                put_ST_TAG ( r_dst, mkU8(0) );
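                       /* Tag value 0 marks the register as empty in this
                          representation; 1 would mean in-use. */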
   6811                break;
   6812 
   6813             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6814                r_dst = (UInt)modrm - 0xD0;
   6815                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6816                /* P4 manual says: "If the destination operand is a
   6817                   non-empty register, the invalid-operation exception
   6818                   is not generated".  Hence put_ST_UNCHECKED. */
   6819                put_ST_UNCHECKED(r_dst, get_ST(0));
   6820                break;
   6821 
   6822             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6823                r_dst = (UInt)modrm - 0xD8;
   6824                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6825                /* P4 manual says: "If the destination operand is a
   6826                   non-empty register, the invalid-operation exception
   6827                   is not generated.  Hence put_ST_UNCHECKED. */
   6828                   is not generated".  Hence put_ST_UNCHECKED. */
   6829                fp_pop();
   6830                break;
   6831 
   6832             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6833                r_dst = (UInt)modrm - 0xE0;
   6834                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6835                /* This forces C1 to zero, which isn't right. */
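                        /* Iop_CmpF64 yields an IRCmpF64Result: 0x45 for
                           unordered, 0x40 for EQ, 0x01 for LT and 0x00 for
                           GT.  Shifted left by 8 and masked with 0x4500,
                           those values land on C3 (bit 14), C2 (bit 10) and
                           C0 (bit 8) of the status word -- the encoding
                           FUCOM is defined to produce. */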
   6836                put_C3210(
   6837                    unop(Iop_32Uto64,
   6838                    binop( Iop_And32,
   6839                           binop(Iop_Shl32,
   6840                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6841                                 mkU8(8)),
   6842                           mkU32(0x4500)
   6843                    )));
   6844                break;
   6845 
   6846             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6847                r_dst = (UInt)modrm - 0xE8;
   6848                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6849                /* This forces C1 to zero, which isn't right. */
   6850                put_C3210(
   6851                    unop(Iop_32Uto64,
   6852                    binop( Iop_And32,
   6853                           binop(Iop_Shl32,
   6854                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6855                                 mkU8(8)),
   6856                           mkU32(0x4500)
   6857                    )));
   6858                fp_pop();
   6859                break;
   6860 
   6861             default:
   6862                goto decode_fail;
   6863          }
   6864       }
   6865    }
   6866 
   6867    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6868    else
   6869    if (first_opcode == 0xDE) {
   6870 
   6871       if (modrm < 0xC0) {
   6872 
   6873          /* bits 5,4,3 are an opcode extension, and the modRM also
   6874             specifies an address. */
   6875          IROp   fop;
   6876          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6877          delta += len;
   6878 
   6879          switch (gregLO3ofRM(modrm)) {
   6880 
   6881             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6882                DIP("fiaddw %s\n", dis_buf);
   6883                fop = Iop_AddF64;
   6884                goto do_fop_m16;
   6885 
   6886             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6887                DIP("fimulw %s\n", dis_buf);
   6888                fop = Iop_MulF64;
   6889                goto do_fop_m16;
   6890 
   6891             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6892                DIP("fisubw %s\n", dis_buf);
   6893                fop = Iop_SubF64;
   6894                goto do_fop_m16;
   6895 
   6896             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6897                DIP("fisubrw %s\n", dis_buf);
   6898                fop = Iop_SubF64;
   6899                goto do_foprev_m16;
   6900 
   6901             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
    6902                DIP("fidivw %s\n", dis_buf);
   6903                fop = Iop_DivF64;
   6904                goto do_fop_m16;
   6905 
   6906             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6907                DIP("fidivrw %s\n", dis_buf);
   6908                fop = Iop_DivF64;
   6909                goto do_foprev_m16;
   6910 
   6911             do_fop_m16:
   6912                put_ST_UNCHECKED(0,
   6913                   triop(fop,
   6914                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6915                         get_ST(0),
   6916                         unop(Iop_I32StoF64,
   6917                              unop(Iop_16Sto32,
   6918                                   loadLE(Ity_I16, mkexpr(addr))))));
   6919                break;
   6920 
   6921             do_foprev_m16:
   6922                put_ST_UNCHECKED(0,
   6923                   triop(fop,
   6924                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6925                         unop(Iop_I32StoF64,
   6926                              unop(Iop_16Sto32,
   6927                                   loadLE(Ity_I16, mkexpr(addr)))),
   6928                         get_ST(0)));
   6929                break;
   6930 
   6931             default:
   6932                vex_printf("unhandled opc_aux = 0x%2x\n",
   6933                           (UInt)gregLO3ofRM(modrm));
   6934                vex_printf("first_opcode == 0xDE\n");
   6935                goto decode_fail;
   6936          }
   6937 
   6938       } else {
   6939 
   6940          delta++;
   6941          switch (modrm) {
   6942 
   6943             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6944                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6945                break;
   6946 
   6947             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6948                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6949                break;
   6950 
   6951             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6952                DIP("fcompp %%st(0),%%st(1)\n");
   6953                /* This forces C1 to zero, which isn't right. */
   6954                put_C3210(
   6955                    unop(Iop_32Uto64,
   6956                    binop( Iop_And32,
   6957                           binop(Iop_Shl32,
   6958                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6959                                 mkU8(8)),
   6960                           mkU32(0x4500)
   6961                    )));
   6962                fp_pop();
   6963                fp_pop();
   6964                break;
   6965 
   6966             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6967                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6968                break;
   6969 
   6970             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6971                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6972                break;
   6973 
   6974             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6975                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6976                break;
   6977 
   6978             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6979                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6980                break;
   6981 
   6982             default:
   6983                goto decode_fail;
   6984          }
   6985 
   6986       }
   6987    }
   6988 
   6989    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6990    else
   6991    if (first_opcode == 0xDF) {
   6992 
   6993       if (modrm < 0xC0) {
   6994 
   6995          /* bits 5,4,3 are an opcode extension, and the modRM also
   6996             specifies an address. */
   6997          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6998          delta += len;
   6999 
   7000          switch (gregLO3ofRM(modrm)) {
   7001 
   7002             case 0: /* FILD m16int */
   7003                DIP("fildw %s\n", dis_buf);
   7004                fp_push();
   7005                put_ST(0, unop(Iop_I32StoF64,
   7006                               unop(Iop_16Sto32,
   7007                                    loadLE(Ity_I16, mkexpr(addr)))));
   7008                break;
   7009 
   7010             case 1: /* FISTTPS m16 (SSE3) */
   7011                DIP("fisttps %s\n", dis_buf);
   7012                storeLE( mkexpr(addr),
   7013                         x87ishly_qnarrow_32_to_16(
   7014                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   7015                fp_pop();
   7016                break;
   7017 
   7018             case 2: /* FIST m16 */
   7019                DIP("fists %s\n", dis_buf);
   7020                storeLE( mkexpr(addr),
   7021                         x87ishly_qnarrow_32_to_16(
   7022                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   7023                break;
   7024 
   7025             case 3: /* FISTP m16 */
   7026                DIP("fistps %s\n", dis_buf);
   7027                storeLE( mkexpr(addr),
   7028                         x87ishly_qnarrow_32_to_16(
   7029                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   7030                fp_pop();
   7031                break;
   7032 
   7033             case 5: /* FILD m64 */
   7034                DIP("fildll %s\n", dis_buf);
   7035                fp_push();
   7036                put_ST(0, binop(Iop_I64StoF64,
   7037                                get_roundingmode(),
   7038                                loadLE(Ity_I64, mkexpr(addr))));
   7039                break;
   7040 
   7041             case 7: /* FISTP m64 */
   7042                DIP("fistpll %s\n", dis_buf);
   7043                storeLE( mkexpr(addr),
   7044                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   7045                fp_pop();
   7046                break;
   7047 
   7048             default:
   7049                vex_printf("unhandled opc_aux = 0x%2x\n",
   7050                           (UInt)gregLO3ofRM(modrm));
   7051                vex_printf("first_opcode == 0xDF\n");
   7052                goto decode_fail;
   7053          }
   7054 
   7055       } else {
   7056 
   7057          delta++;
   7058          switch (modrm) {
   7059 
   7060             case 0xC0: /* FFREEP %st(0) */
   7061                DIP("ffreep %%st(%d)\n", 0);
   7062                put_ST_TAG ( 0, mkU8(0) );
   7063                fp_pop();
   7064                break;
   7065 
   7066             case 0xE0: /* FNSTSW %ax */
   7067                DIP("fnstsw %%ax\n");
   7068                /* Invent a plausible-looking FPU status word value and
   7069                   dump it in %AX:
   7070                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   7071                */
   7072                putIRegRAX(
   7073                   2,
   7074                   unop(Iop_32to16,
   7075                        binop(Iop_Or32,
   7076                              binop(Iop_Shl32,
   7077                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   7078                                    mkU8(11)),
   7079                              binop(Iop_And32,
   7080                                    unop(Iop_64to32, get_C3210()),
   7081                                    mkU32(0x4700))
   7082                )));
   7083                break;
   7084 
   7085             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   7086                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   7087                break;
   7088 
   7089             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   7090                /* not really right since COMIP != UCOMIP */
   7091                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   7092                break;
   7093 
   7094             default:
   7095                goto decode_fail;
   7096          }
   7097       }
   7098 
   7099    }
   7100 
   7101    else
   7102       goto decode_fail;
   7103 
   7104    *decode_ok = True;
   7105    return delta;
   7106 
   7107   decode_fail:
   7108    *decode_ok = False;
   7109    return delta;
   7110 }
   7111 
   7112 
   7113 /*------------------------------------------------------------*/
   7114 /*---                                                      ---*/
   7115 /*--- MMX INSTRUCTIONS                                     ---*/
   7116 /*---                                                      ---*/
   7117 /*------------------------------------------------------------*/
   7118 
   7119 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   7120    IA32 arch manual, volume 3):
   7121 
    7122    Read from, or write to, an MMX register (viz, any insn except EMMS):
   7123    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   7124    * FP stack pointer set to zero
   7125 
   7126    EMMS:
   7127    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   7128    * FP stack pointer set to zero
   7129 */
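
         /* Illustrative only -- a plain-C sketch of the guest-visible
            effect that the two preambles below encode in IR, written
            over a hypothetical guest-state struct (the field names are
            assumptions for illustration, not VEX's actual ones):

               void mmx_preamble_model ( GuestStateModel* st ) {
                  st->ftop = 0;                                 // FP stack ptr
                  for (int i = 0; i < 8; i++) st->fptag[i] = 1; // all Valid
               }
               void emms_model ( GuestStateModel* st ) {
                  st->ftop = 0;
                  for (int i = 0; i < 8; i++) st->fptag[i] = 0; // all Empty
               }
         */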
   7130 
   7131 static void do_MMX_preamble ( void )
   7132 {
   7133    Int         i;
   7134    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7135    IRExpr*     zero  = mkU32(0);
   7136    IRExpr*     tag1  = mkU8(1);
   7137    put_ftop(zero);
   7138    for (i = 0; i < 8; i++)
   7139       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7140 }
   7141 
   7142 static void do_EMMS_preamble ( void )
   7143 {
   7144    Int         i;
   7145    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7146    IRExpr*     zero  = mkU32(0);
   7147    IRExpr*     tag0  = mkU8(0);
   7148    put_ftop(zero);
   7149    for (i = 0; i < 8; i++)
   7150       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7151 }
   7152 
   7153 
   7154 static IRExpr* getMMXReg ( UInt archreg )
   7155 {
   7156    vassert(archreg < 8);
   7157    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7158 }
   7159 
   7160 
   7161 static void putMMXReg ( UInt archreg, IRExpr* e )
   7162 {
   7163    vassert(archreg < 8);
   7164    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7165    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7166 }
   7167 
   7168 
   7169 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   7170    sense that it does not first call do_MMX_preamble() -- that is the
   7171    responsibility of its caller. */
   7172 
   7173 static
   7174 ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
   7175                                 Prefix      pfx,
   7176                                 Long        delta,
   7177                                 UChar       opc,
   7178                                 const HChar* name,
   7179                                 Bool        show_granularity )
   7180 {
   7181    HChar   dis_buf[50];
   7182    UChar   modrm = getUChar(delta);
   7183    Bool    isReg = epartIsReg(modrm);
   7184    IRExpr* argL  = NULL;
   7185    IRExpr* argR  = NULL;
   7186    IRExpr* argG  = NULL;
   7187    IRExpr* argE  = NULL;
   7188    IRTemp  res   = newTemp(Ity_I64);
   7189 
   7190    Bool    invG  = False;
   7191    IROp    op    = Iop_INVALID;
   7192    void*   hAddr = NULL;
   7193    const HChar*  hName = NULL;
   7194    Bool    eLeft = False;
   7195 
   7196 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   7197 
   7198    switch (opc) {
   7199       /* Original MMX ones */
   7200       case 0xFC: op = Iop_Add8x8; break;
   7201       case 0xFD: op = Iop_Add16x4; break;
   7202       case 0xFE: op = Iop_Add32x2; break;
   7203 
   7204       case 0xEC: op = Iop_QAdd8Sx8; break;
   7205       case 0xED: op = Iop_QAdd16Sx4; break;
   7206 
   7207       case 0xDC: op = Iop_QAdd8Ux8; break;
   7208       case 0xDD: op = Iop_QAdd16Ux4; break;
   7209 
   7210       case 0xF8: op = Iop_Sub8x8;  break;
   7211       case 0xF9: op = Iop_Sub16x4; break;
   7212       case 0xFA: op = Iop_Sub32x2; break;
   7213 
   7214       case 0xE8: op = Iop_QSub8Sx8; break;
   7215       case 0xE9: op = Iop_QSub16Sx4; break;
   7216 
   7217       case 0xD8: op = Iop_QSub8Ux8; break;
   7218       case 0xD9: op = Iop_QSub16Ux4; break;
   7219 
   7220       case 0xE5: op = Iop_MulHi16Sx4; break;
   7221       case 0xD5: op = Iop_Mul16x4; break;
   7222       case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
   7223 
   7224       case 0x74: op = Iop_CmpEQ8x8; break;
   7225       case 0x75: op = Iop_CmpEQ16x4; break;
   7226       case 0x76: op = Iop_CmpEQ32x2; break;
   7227 
   7228       case 0x64: op = Iop_CmpGT8Sx8; break;
   7229       case 0x65: op = Iop_CmpGT16Sx4; break;
   7230       case 0x66: op = Iop_CmpGT32Sx2; break;
   7231 
   7232       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   7233       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   7234       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   7235 
   7236       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   7237       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   7238       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   7239 
   7240       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   7241       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   7242       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   7243 
   7244       case 0xDB: op = Iop_And64; break;
   7245       case 0xDF: op = Iop_And64; invG = True; break;
   7246       case 0xEB: op = Iop_Or64; break;
   7247       case 0xEF: /* Possibly do better here if argL and argR are the
   7248                     same reg */
   7249                  op = Iop_Xor64; break;
   7250 
   7251       /* Introduced in SSE1 */
   7252       case 0xE0: op = Iop_Avg8Ux8;    break;
   7253       case 0xE3: op = Iop_Avg16Ux4;   break;
   7254       case 0xEE: op = Iop_Max16Sx4;   break;
   7255       case 0xDE: op = Iop_Max8Ux8;    break;
   7256       case 0xEA: op = Iop_Min16Sx4;   break;
   7257       case 0xDA: op = Iop_Min8Ux8;    break;
   7258       case 0xE4: op = Iop_MulHi16Ux4; break;
   7259       case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
   7260 
   7261       /* Introduced in SSE2 */
   7262       case 0xD4: op = Iop_Add64; break;
   7263       case 0xFB: op = Iop_Sub64; break;
   7264 
   7265       default:
   7266          vex_printf("\n0x%x\n", (UInt)opc);
   7267          vpanic("dis_MMXop_regmem_to_reg");
   7268    }
   7269 
   7270 #  undef XXX
   7271 
   7272    argG = getMMXReg(gregLO3ofRM(modrm));
   7273    if (invG)
   7274       argG = unop(Iop_Not64, argG);
   7275 
   7276    if (isReg) {
   7277       delta++;
   7278       argE = getMMXReg(eregLO3ofRM(modrm));
   7279    } else {
   7280       Int    len;
   7281       IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7282       delta += len;
   7283       argE = loadLE(Ity_I64, mkexpr(addr));
   7284    }
   7285 
   7286    if (eLeft) {
   7287       argL = argE;
   7288       argR = argG;
   7289    } else {
   7290       argL = argG;
   7291       argR = argE;
   7292    }
   7293 
   7294    if (op != Iop_INVALID) {
   7295       vassert(hName == NULL);
   7296       vassert(hAddr == NULL);
   7297       assign(res, binop(op, argL, argR));
   7298    } else {
   7299       vassert(hName != NULL);
   7300       vassert(hAddr != NULL);
   7301       assign( res,
   7302               mkIRExprCCall(
   7303                  Ity_I64,
   7304                  0/*regparms*/, hName, hAddr,
   7305                  mkIRExprVec_2( argL, argR )
   7306               )
   7307             );
   7308    }
   7309 
   7310    putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   7311 
   7312    DIP("%s%s %s, %s\n",
   7313        name, show_granularity ? nameMMXGran(opc & 3) : "",
   7314        ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
   7315        nameMMXReg(gregLO3ofRM(modrm)) );
   7316 
   7317    return delta;
   7318 }
   7319 
   7320 
   7321 /* Vector by scalar shift of G by the amount specified at the bottom
   7322    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   7323 
   7324 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
   7325                                   Prefix pfx, Long delta,
   7326                                   const HChar* opname, IROp op )
   7327 {
   7328    HChar   dis_buf[50];
   7329    Int     alen, size;
   7330    IRTemp  addr;
   7331    Bool    shl, shr, sar;
   7332    UChar   rm   = getUChar(delta);
   7333    IRTemp  g0   = newTemp(Ity_I64);
   7334    IRTemp  g1   = newTemp(Ity_I64);
   7335    IRTemp  amt  = newTemp(Ity_I64);
   7336    IRTemp  amt8 = newTemp(Ity_I8);
   7337 
   7338    if (epartIsReg(rm)) {
   7339       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7340       DIP("%s %s,%s\n", opname,
   7341                         nameMMXReg(eregLO3ofRM(rm)),
   7342                         nameMMXReg(gregLO3ofRM(rm)) );
   7343       delta++;
   7344    } else {
   7345       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7346       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7347       DIP("%s %s,%s\n", opname,
   7348                         dis_buf,
   7349                         nameMMXReg(gregLO3ofRM(rm)) );
   7350       delta += alen;
   7351    }
   7352    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7353    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7354 
   7355    shl = shr = sar = False;
   7356    size = 0;
   7357    switch (op) {
    7358       case Iop_ShlN16x4: shl = True; size = 16; break;
   7359       case Iop_ShlN32x2: shl = True; size = 32; break;
   7360       case Iop_Shl64:    shl = True; size = 64; break;
   7361       case Iop_ShrN16x4: shr = True; size = 16; break;
   7362       case Iop_ShrN32x2: shr = True; size = 32; break;
   7363       case Iop_Shr64:    shr = True; size = 64; break;
   7364       case Iop_SarN16x4: sar = True; size = 16; break;
   7365       case Iop_SarN32x2: sar = True; size = 32; break;
   7366       default: vassert(0);
   7367    }
   7368 
   7369    if (shl || shr) {
   7370      assign(
   7371         g1,
   7372         IRExpr_ITE(
   7373            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7374            binop(op, mkexpr(g0), mkexpr(amt8)),
   7375            mkU64(0)
   7376         )
   7377      );
   7378    } else
   7379    if (sar) {
   7380      assign(
   7381         g1,
   7382         IRExpr_ITE(
   7383            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7384            binop(op, mkexpr(g0), mkexpr(amt8)),
   7385            binop(op, mkexpr(g0), mkU8(size-1))
   7386         )
   7387      );
   7388    } else {
   7389       vassert(0);
   7390    }
   7391 
   7392    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7393    return delta;
   7394 }
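
         /* Illustrative only -- a scalar plain-C model of the
            out-of-range handling above, for one 16-bit lane (a sketch,
            assuming hardware PSRLW/PSRAW semantics: logical shifts give
            0 once the amount reaches the lane width, arithmetic shifts
            clamp the amount to lane width - 1):

               UShort psrlw_lane_model ( UShort lane, ULong amt ) {
                  return amt < 16 ? (UShort)(lane >> amt) : 0;
               }
               Short psraw_lane_model ( Short lane, ULong amt ) {
                  return (Short)(lane >> (amt < 16 ? (UInt)amt : 15));
               }
         */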
   7395 
   7396 
   7397 /* Vector by scalar shift of E by an immediate byte.  This is a
   7398    straight copy of dis_SSE_shiftE_imm. */
   7399 
   7400 static
   7401 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7402 {
   7403    Bool    shl, shr, sar;
   7404    UChar   rm   = getUChar(delta);
   7405    IRTemp  e0   = newTemp(Ity_I64);
   7406    IRTemp  e1   = newTemp(Ity_I64);
   7407    UChar   amt, size;
   7408    vassert(epartIsReg(rm));
   7409    vassert(gregLO3ofRM(rm) == 2
   7410            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7411    amt = getUChar(delta+1);
   7412    delta += 2;
   7413    DIP("%s $%d,%s\n", opname,
   7414                       (Int)amt,
   7415                       nameMMXReg(eregLO3ofRM(rm)) );
   7416 
   7417    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7418 
   7419    shl = shr = sar = False;
   7420    size = 0;
   7421    switch (op) {
   7422       case Iop_ShlN16x4: shl = True; size = 16; break;
   7423       case Iop_ShlN32x2: shl = True; size = 32; break;
   7424       case Iop_Shl64:    shl = True; size = 64; break;
   7425       case Iop_SarN16x4: sar = True; size = 16; break;
   7426       case Iop_SarN32x2: sar = True; size = 32; break;
   7427       case Iop_ShrN16x4: shr = True; size = 16; break;
   7428       case Iop_ShrN32x2: shr = True; size = 32; break;
   7429       case Iop_Shr64:    shr = True; size = 64; break;
   7430       default: vassert(0);
   7431    }
   7432 
   7433    if (shl || shr) {
   7434      assign( e1, amt >= size
   7435                     ? mkU64(0)
   7436                     : binop(op, mkexpr(e0), mkU8(amt))
   7437      );
   7438    } else
   7439    if (sar) {
   7440      assign( e1, amt >= size
   7441                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7442                     : binop(op, mkexpr(e0), mkU8(amt))
   7443      );
   7444    } else {
   7445       vassert(0);
   7446    }
   7447 
   7448    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7449    return delta;
   7450 }
   7451 
   7452 
   7453 /* Completely handle all MMX instructions except emms. */
   7454 
   7455 static
   7456 ULong dis_MMX ( Bool* decode_ok,
   7457                 const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   7458 {
   7459    Int   len;
   7460    UChar modrm;
   7461    HChar dis_buf[50];
   7462    UChar opc = getUChar(delta);
   7463    delta++;
   7464 
   7465    /* dis_MMX handles all insns except emms. */
   7466    do_MMX_preamble();
   7467 
   7468    switch (opc) {
   7469 
   7470       case 0x6E:
   7471          if (sz == 4) {
   7472             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   7473             modrm = getUChar(delta);
   7474             if (epartIsReg(modrm)) {
   7475                delta++;
   7476                putMMXReg(
   7477                   gregLO3ofRM(modrm),
   7478                   binop( Iop_32HLto64,
   7479                          mkU32(0),
   7480                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   7481                DIP("movd %s, %s\n",
   7482                    nameIReg32(eregOfRexRM(pfx,modrm)),
   7483                    nameMMXReg(gregLO3ofRM(modrm)));
   7484             } else {
   7485                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7486                delta += len;
   7487                putMMXReg(
   7488                   gregLO3ofRM(modrm),
   7489                   binop( Iop_32HLto64,
   7490                          mkU32(0),
   7491                          loadLE(Ity_I32, mkexpr(addr)) ) );
   7492                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7493             }
   7494          }
   7495          else
   7496          if (sz == 8) {
   7497             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   7498             modrm = getUChar(delta);
   7499             if (epartIsReg(modrm)) {
   7500                delta++;
   7501                putMMXReg( gregLO3ofRM(modrm),
   7502                           getIReg64(eregOfRexRM(pfx,modrm)) );
   7503                DIP("movd %s, %s\n",
   7504                    nameIReg64(eregOfRexRM(pfx,modrm)),
   7505                    nameMMXReg(gregLO3ofRM(modrm)));
   7506             } else {
   7507                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7508                delta += len;
   7509                putMMXReg( gregLO3ofRM(modrm),
   7510                           loadLE(Ity_I64, mkexpr(addr)) );
   7511                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7512             }
   7513          }
   7514          else {
   7515             goto mmx_decode_failure;
   7516          }
   7517          break;
   7518 
   7519       case 0x7E:
   7520          if (sz == 4) {
   7521             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   7522             modrm = getUChar(delta);
   7523             if (epartIsReg(modrm)) {
   7524                delta++;
   7525                putIReg32( eregOfRexRM(pfx,modrm),
   7526                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7527                DIP("movd %s, %s\n",
   7528                    nameMMXReg(gregLO3ofRM(modrm)),
   7529                    nameIReg32(eregOfRexRM(pfx,modrm)));
   7530             } else {
   7531                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7532                delta += len;
   7533                storeLE( mkexpr(addr),
   7534                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7535                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7536             }
   7537          }
   7538          else
   7539          if (sz == 8) {
   7540             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   7541             modrm = getUChar(delta);
   7542             if (epartIsReg(modrm)) {
   7543                delta++;
   7544                putIReg64( eregOfRexRM(pfx,modrm),
   7545                           getMMXReg(gregLO3ofRM(modrm)) );
   7546                DIP("movd %s, %s\n",
   7547                    nameMMXReg(gregLO3ofRM(modrm)),
   7548                    nameIReg64(eregOfRexRM(pfx,modrm)));
   7549             } else {
   7550                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7551                delta += len;
   7552                storeLE( mkexpr(addr),
   7553                        getMMXReg(gregLO3ofRM(modrm)) );
   7554                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7555             }
   7556          } else {
   7557             goto mmx_decode_failure;
   7558          }
   7559          break;
   7560 
   7561       case 0x6F:
   7562          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   7563          if (sz != 4
   7564              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7565             goto mmx_decode_failure;
   7566          modrm = getUChar(delta);
   7567          if (epartIsReg(modrm)) {
   7568             delta++;
   7569             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   7570             DIP("movq %s, %s\n",
   7571                 nameMMXReg(eregLO3ofRM(modrm)),
   7572                 nameMMXReg(gregLO3ofRM(modrm)));
   7573          } else {
   7574             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7575             delta += len;
   7576             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   7577             DIP("movq %s, %s\n",
   7578                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7579          }
   7580          break;
   7581 
   7582       case 0x7F:
   7583          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   7584          if (sz != 4
   7585              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7586             goto mmx_decode_failure;
   7587          modrm = getUChar(delta);
   7588          if (epartIsReg(modrm)) {
   7589             delta++;
   7590             putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
   7591             DIP("movq %s, %s\n",
   7592                 nameMMXReg(gregLO3ofRM(modrm)),
   7593                 nameMMXReg(eregLO3ofRM(modrm)));
   7594          } else {
   7595             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7596             delta += len;
   7597             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   7598             DIP("mov(nt)q %s, %s\n",
   7599                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7600          }
   7601          break;
   7602 
   7603       case 0xFC:
   7604       case 0xFD:
   7605       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   7606          if (sz != 4)
   7607             goto mmx_decode_failure;
   7608          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   7609          break;
   7610 
   7611       case 0xEC:
   7612       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7613          if (sz != 4
   7614              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7615             goto mmx_decode_failure;
   7616          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   7617          break;
   7618 
   7619       case 0xDC:
   7620       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7621          if (sz != 4)
   7622             goto mmx_decode_failure;
   7623          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   7624          break;
   7625 
   7626       case 0xF8:
   7627       case 0xF9:
   7628       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   7629          if (sz != 4)
   7630             goto mmx_decode_failure;
   7631          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   7632          break;
   7633 
   7634       case 0xE8:
   7635       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7636          if (sz != 4)
   7637             goto mmx_decode_failure;
   7638          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   7639          break;
   7640 
   7641       case 0xD8:
   7642       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7643          if (sz != 4)
   7644             goto mmx_decode_failure;
   7645          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   7646          break;
   7647 
   7648       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   7649          if (sz != 4)
   7650             goto mmx_decode_failure;
   7651          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   7652          break;
   7653 
   7654       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   7655          if (sz != 4)
   7656             goto mmx_decode_failure;
   7657          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   7658          break;
   7659 
   7660       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   7661          vassert(sz == 4);
   7662          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   7663          break;
   7664 
   7665       case 0x74:
   7666       case 0x75:
   7667       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   7668          if (sz != 4)
   7669             goto mmx_decode_failure;
   7670          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   7671          break;
   7672 
   7673       case 0x64:
   7674       case 0x65:
   7675       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   7676          if (sz != 4)
   7677             goto mmx_decode_failure;
   7678          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   7679          break;
   7680 
   7681       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   7682          if (sz != 4)
   7683             goto mmx_decode_failure;
   7684          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   7685          break;
   7686 
   7687       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7688          if (sz != 4)
   7689             goto mmx_decode_failure;
   7690          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   7691          break;
   7692 
   7693       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7694          if (sz != 4)
   7695             goto mmx_decode_failure;
   7696          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   7697          break;
   7698 
   7699       case 0x68:
   7700       case 0x69:
   7701       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   7702          if (sz != 4
   7703              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7704             goto mmx_decode_failure;
   7705          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   7706          break;
   7707 
   7708       case 0x60:
   7709       case 0x61:
   7710       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7711          if (sz != 4
   7712              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7713             goto mmx_decode_failure;
   7714          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   7715          break;
   7716 
   7717       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   7718          if (sz != 4)
   7719             goto mmx_decode_failure;
   7720          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   7721          break;
   7722 
   7723       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   7724          if (sz != 4)
   7725             goto mmx_decode_failure;
   7726          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7727          break;
   7728 
   7729       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7730          if (sz != 4)
   7731             goto mmx_decode_failure;
   7732          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7733          break;
   7734 
   7735       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7736          if (sz != 4)
   7737             goto mmx_decode_failure;
   7738          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7739          break;
   7740 
   7741 #     define SHIFT_BY_REG(_name,_op)                                     \
   7742                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7743                 break;
   7744 
   7745       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7746       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7747       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7748       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7749 
   7750       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7751       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7752       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7753       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7754 
   7755       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7756       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7757       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7758 
   7759 #     undef SHIFT_BY_REG
   7760 
   7761       case 0x71:
   7762       case 0x72:
   7763       case 0x73: {
   7764          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7765          UChar byte2, subopc;
   7766          if (sz != 4)
   7767             goto mmx_decode_failure;
   7768          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7769          subopc = toUChar( (byte2 >> 3) & 7 );
   7770 
   7771 #        define SHIFT_BY_IMM(_name,_op)                        \
   7772             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7773             } while (0)
   7774 
   7775               if (subopc == 2 /*SRL*/ && opc == 0x71)
    7776                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7777          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7778                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7779          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7780                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7781 
   7782          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7783                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7784          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7785                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7786 
   7787          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7788                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7789          else if (subopc == 6 /*SHL*/ && opc == 0x72)
    7790                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7791          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7792                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7793 
   7794          else goto mmx_decode_failure;
   7795 
   7796 #        undef SHIFT_BY_IMM
   7797          break;
   7798       }
   7799 
   7800       case 0xF7: {
   7801          IRTemp addr    = newTemp(Ity_I64);
   7802          IRTemp regD    = newTemp(Ity_I64);
   7803          IRTemp regM    = newTemp(Ity_I64);
   7804          IRTemp mask    = newTemp(Ity_I64);
   7805          IRTemp olddata = newTemp(Ity_I64);
   7806          IRTemp newdata = newTemp(Ity_I64);
   7807 
   7808          modrm = getUChar(delta);
   7809          if (sz != 4 || (!epartIsReg(modrm)))
   7810             goto mmx_decode_failure;
   7811          delta++;
   7812 
   7813          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7814          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7815          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7816          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
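                  /* SarN8x8 by 7 replicates each byte's sign bit across
                     the byte, so 'mask' is 0xFF where the corresponding
                     regM byte has its MSB set, else 0x00.  The Or/And
                     network below is then a byte-wise select: take the
                     regD byte where the mask is set, otherwise keep the
                     byte already in memory. */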
   7817          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7818          assign( newdata,
   7819                  binop(Iop_Or64,
   7820                        binop(Iop_And64,
   7821                              mkexpr(regD),
   7822                              mkexpr(mask) ),
   7823                        binop(Iop_And64,
   7824                              mkexpr(olddata),
   7825                              unop(Iop_Not64, mkexpr(mask)))) );
   7826          storeLE( mkexpr(addr), mkexpr(newdata) );
   7827          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7828                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7829          break;
   7830       }
   7831 
   7832       /* --- MMX decode failure --- */
   7833       default:
   7834       mmx_decode_failure:
   7835          *decode_ok = False;
   7836          return delta; /* ignored */
   7837 
   7838    }
   7839 
   7840    *decode_ok = True;
   7841    return delta;
   7842 }
   7843 
   7844 
   7845 /*------------------------------------------------------------*/
   7846 /*--- More misc arithmetic and other obscure insns.        ---*/
   7847 /*------------------------------------------------------------*/
   7848 
   7849 /* Generate base << amt with vacated places filled with stuff
   7850    from xtra.  amt guaranteed in 0 .. 63. */
   7851 static
   7852 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7853 {
   7854    /* if   amt == 0
   7855       then base
   7856       else (base << amt) | (xtra >>u (64-amt))
   7857    */
   7858    return
   7859       IRExpr_ITE(
   7860          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7861          binop(Iop_Or64,
   7862                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7863                binop(Iop_Shr64, mkexpr(xtra),
   7864                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7865                ),
   7866          mkexpr(base)
   7867       );
   7868 }
   7869 
   7870 /* Generate base >>u amt with vacated places filled with stuff
   7871    from xtra.  amt guaranteed in 0 .. 63. */
   7872 static
   7873 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7874 {
   7875    /* if   amt == 0
   7876       then base
   7877       else (base >>u amt) | (xtra << (64-amt))
   7878    */
   7879    return
   7880       IRExpr_ITE(
   7881          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7882          binop(Iop_Or64,
   7883                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7884                binop(Iop_Shl64, mkexpr(xtra),
   7885                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7886                ),
   7887          mkexpr(base)
   7888       );
   7889 }
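
         /* Illustrative only -- what the two helpers above compute, as
            plain C (a sketch; amt must already be in 0 .. 63, which the
            callers guarantee):

               ULong shld64_model ( ULong base, ULong xtra, UInt amt ) {
                  return amt == 0 ? base
                                  : (base << amt) | (xtra >> (64 - amt));
               }
               ULong shrd64_model ( ULong base, ULong xtra, UInt amt ) {
                  return amt == 0 ? base
                                  : (base >> amt) | (xtra << (64 - amt));
               }
         */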
   7890 
   7891 /* Double length left and right shifts.  Apparently only required in
   7892    v-size (no b- variant). */
   7893 static
   7894 ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
   7895                         Prefix pfx,
   7896                         Long delta, UChar modrm,
   7897                         Int sz,
   7898                         IRExpr* shift_amt,
   7899                         Bool amt_is_literal,
   7900                         const HChar* shift_amt_txt,
   7901                         Bool left_shift )
   7902 {
   7903    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
    7904       for printing it.  And delta on entry points at the modrm byte. */
   7905    Int len;
   7906    HChar dis_buf[50];
   7907 
   7908    IRType ty     = szToITy(sz);
   7909    IRTemp gsrc   = newTemp(ty);
   7910    IRTemp esrc   = newTemp(ty);
   7911    IRTemp addr   = IRTemp_INVALID;
   7912    IRTemp tmpSH  = newTemp(Ity_I8);
   7913    IRTemp tmpSS  = newTemp(Ity_I8);
   7914    IRTemp tmp64  = IRTemp_INVALID;
   7915    IRTemp res64  = IRTemp_INVALID;
   7916    IRTemp rss64  = IRTemp_INVALID;
   7917    IRTemp resTy  = IRTemp_INVALID;
   7918    IRTemp rssTy  = IRTemp_INVALID;
   7919    Int    mask   = sz==8 ? 63 : 31;
   7920 
   7921    vassert(sz == 2 || sz == 4 || sz == 8);
   7922 
   7923    /* The E-part is the destination; this is shifted.  The G-part
   7924       supplies bits to be shifted into the E-part, but is not
   7925       changed.
   7926 
   7927       If shifting left, form a double-length word with E at the top
   7928       and G at the bottom, and shift this left.  The result is then in
   7929       the high part.
   7930 
   7931       If shifting right, form a double-length word with G at the top
   7932       and E at the bottom, and shift this right.  The result is then
   7933       at the bottom.  */
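
            /* Illustrative only -- for the sz==4, left-shift case below,
               the IR amounts to this plain-C sketch:

                  ULong tmp = ((ULong)esrc << 32) | gsrc; // E on top, G below
                  UInt  res = (UInt)((tmp << amt) >> 32); // high 32 bits

               with a second copy shifted by amt-1 (tmpSS) feeding the
               flags thunk. */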
   7934 
   7935    /* Fetch the operands. */
   7936 
   7937    assign( gsrc, getIRegG(sz, pfx, modrm) );
   7938 
   7939    if (epartIsReg(modrm)) {
   7940       delta++;
   7941       assign( esrc, getIRegE(sz, pfx, modrm) );
   7942       DIP("sh%cd%c %s, %s, %s\n",
   7943           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   7944           shift_amt_txt,
   7945           nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   7946    } else {
   7947       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   7948                         /* # bytes following amode */
   7949                         amt_is_literal ? 1 : 0 );
   7950       delta += len;
   7951       assign( esrc, loadLE(ty, mkexpr(addr)) );
   7952       DIP("sh%cd%c %s, %s, %s\n",
   7953           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   7954           shift_amt_txt,
   7955           nameIRegG(sz, pfx, modrm), dis_buf);
   7956    }
   7957 
   7958    /* Calculate the masked shift amount (tmpSH), the masked subshift
   7959       amount (tmpSS), the shifted value (res64) and the subshifted
   7960       value (rss64). */
   7961 
   7962    assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   7963    assign( tmpSS, binop(Iop_And8,
   7964                         binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   7965                         mkU8(mask)));
   7966 
   7967    tmp64 = newTemp(Ity_I64);
   7968    res64 = newTemp(Ity_I64);
   7969    rss64 = newTemp(Ity_I64);
   7970 
   7971    if (sz == 2 || sz == 4) {
   7972 
   7973       /* G is xtra; E is data */
   7974       /* what a freaking nightmare: */
   7975       if (sz == 4 && left_shift) {
   7976          assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
   7977          assign( res64,
   7978                  binop(Iop_Shr64,
   7979                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
   7980                        mkU8(32)) );
   7981          assign( rss64,
   7982                  binop(Iop_Shr64,
   7983                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
   7984                        mkU8(32)) );
   7985       }
   7986       else
   7987       if (sz == 4 && !left_shift) {
   7988          assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
   7989          assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
   7990          assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
   7991       }
   7992       else
   7993       if (sz == 2 && left_shift) {
   7994          assign( tmp64,
   7995                  binop(Iop_32HLto64,
   7996                        binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
   7997                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
   7998          ));
   7999          /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
   8000          assign( res64,
   8001                  binop(Iop_Shr64,
   8002                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
   8003                        mkU8(48)) );
   8004          /* subshift formed by shifting [esrc'0000'0000'0000] */
   8005          assign( rss64,
   8006                  binop(Iop_Shr64,
   8007                        binop(Iop_Shl64,
   8008                              binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
   8009                                               mkU8(48)),
   8010                              mkexpr(tmpSS)),
   8011                        mkU8(48)) );
   8012       }
   8013       else
   8014       if (sz == 2 && !left_shift) {
   8015          assign( tmp64,
   8016                  binop(Iop_32HLto64,
   8017                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
   8018                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
   8019          ));
   8020          /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
   8021          assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
   8022          /* subshift formed by shifting [0000'0000'0000'esrc] */
   8023          assign( rss64, binop(Iop_Shr64,
   8024                               unop(Iop_16Uto64, mkexpr(esrc)),
   8025                               mkexpr(tmpSS)) );
   8026       }
   8027 
   8028    } else {
   8029 
   8030       vassert(sz == 8);
   8031       if (left_shift) {
   8032          assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
   8033          assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
   8034       } else {
   8035          assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
   8036          assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
   8037       }
   8038 
   8039    }
   8040 
   8041    resTy = newTemp(ty);
   8042    rssTy = newTemp(ty);
   8043    assign( resTy, narrowTo(ty, mkexpr(res64)) );
   8044    assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
   8045 
   8046    /* Put result back and write the flags thunk. */
   8047    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
   8048                               resTy, rssTy, ty, tmpSH );
   8049 
   8050    if (epartIsReg(modrm)) {
   8051       putIRegE(sz, pfx, modrm, mkexpr(resTy));
   8052    } else {
   8053       storeLE( mkexpr(addr), mkexpr(resTy) );
   8054    }
   8055 
   8056    if (amt_is_literal) delta++;
   8057    return delta;
   8058 }
   8059 
   8060 
   8061 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   8062    required. */
   8063 
   8064 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   8065 
   8066 static const HChar* nameBtOp ( BtOp op )
   8067 {
   8068    switch (op) {
   8069       case BtOpNone:  return "";
   8070       case BtOpSet:   return "s";
   8071       case BtOpReset: return "r";
   8072       case BtOpComp:  return "c";
   8073       default: vpanic("nameBtOp(amd64)");
   8074    }
   8075 }
   8076 
   8077 
   8078 static
   8079 ULong dis_bt_G_E ( const VexAbiInfo* vbi,
   8080                    Prefix pfx, Int sz, Long delta, BtOp op,
   8081                    /*OUT*/Bool* decode_OK )
   8082 {
   8083    HChar  dis_buf[50];
   8084    UChar  modrm;
   8085    Int    len;
   8086    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   8087           t_addr1, t_rsp, t_mask, t_new;
   8088 
   8089    vassert(sz == 2 || sz == 4 || sz == 8);
   8090 
   8091    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   8092              = t_addr0 = t_addr1 = t_rsp
   8093              = t_mask = t_new = IRTemp_INVALID;
   8094 
   8095    t_fetched = newTemp(Ity_I8);
   8096    t_new     = newTemp(Ity_I8);
   8097    t_bitno0  = newTemp(Ity_I64);
   8098    t_bitno1  = newTemp(Ity_I64);
   8099    t_bitno2  = newTemp(Ity_I8);
   8100    t_addr1   = newTemp(Ity_I64);
   8101    modrm     = getUChar(delta);
   8102 
   8103    *decode_OK = True;
   8104    if (epartIsReg(modrm)) {
   8105       /* F2 and F3 are never acceptable. */
   8106       if (haveF2orF3(pfx)) {
   8107          *decode_OK = False;
   8108          return delta;
   8109       }
   8110    } else {
   8111       /* F2 or F3 (but not both) are allowed, provided LOCK is also
   8112          present, and only for the BTC/BTS/BTR cases (not BT). */
   8113       if (haveF2orF3(pfx)) {
   8114          if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
   8115             *decode_OK = False;
   8116             return delta;
   8117          }
   8118       }
   8119    }
   8120 
   8121    assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
   8122 
   8123    if (epartIsReg(modrm)) {
   8124       delta++;
   8125       /* Get it onto the client's stack.  Oh, this is a horrible
   8126          kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
   8127          Because of the ELF ABI stack redzone, there may be live data
   8128          up to 128 bytes below %RSP.  So we can't just push it on the
   8129          stack, else we may wind up trashing live data, and causing
   8130          impossible-to-find simulation errors.  (Yes, this did
    8131          happen.)  So we need to drop RSP by at least 128 before
   8132          pushing it.  That unfortunately means hitting Memcheck's
   8133          fast-case painting code.  Ideally we should drop more than
   8134          128, to reduce the chances of breaking buggy programs that
   8135          have live data below -128(%RSP).  Memcheck fast-cases moves
   8136          of 288 bytes due to the need to handle ppc64-linux quickly,
   8137          so let's use 288.  Of course the real fix is to get rid of
   8138          this kludge entirely.  */
   8139       t_rsp = newTemp(Ity_I64);
   8140       t_addr0 = newTemp(Ity_I64);
   8141 
   8142       vassert(vbi->guest_stack_redzone_size == 128);
   8143       assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
   8144       putIReg64(R_RSP, mkexpr(t_rsp));
   8145 
   8146       storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
   8147 
   8148       /* Make t_addr0 point at it. */
   8149       assign( t_addr0, mkexpr(t_rsp) );
   8150 
   8151       /* Mask out upper bits of the shift amount, since we're doing a
   8152          reg. */
   8153       assign( t_bitno1, binop(Iop_And64,
   8154                               mkexpr(t_bitno0),
   8155                               mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
   8156 
   8157    } else {
   8158       t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   8159       delta += len;
   8160       assign( t_bitno1, mkexpr(t_bitno0) );
   8161    }
   8162 
   8163    /* At this point: t_addr0 is the address being operated on.  If it
   8164       was a reg, we will have pushed it onto the client's stack.
   8165       t_bitno1 is the bit number, suitably masked in the case of a
   8166       reg.  */
   8167 
   8168    /* Now the main sequence. */
   8169    assign( t_addr1,
   8170            binop(Iop_Add64,
   8171                  mkexpr(t_addr0),
   8172                  binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
   8173 
   8174    /* t_addr1 now holds effective address */
   8175 
   8176    assign( t_bitno2,
   8177            unop(Iop_64to8,
   8178                 binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
   8179 
   8180    /* t_bitno2 contains offset of bit within byte */
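
            /* Illustrative only -- the arithmetic above in plain C (a
               sketch; note the arithmetic shift: for the memory form the
               bit number is a signed offset and may be negative):

                  UChar* p   = addr + (bitno >> 3); // byte holding the bit
                  UInt   bit = bitno & 7;           // offset within it
                  carry      = (*p >> bit) & 1;
            */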
   8181 
   8182    if (op != BtOpNone) {
   8183       t_mask = newTemp(Ity_I8);
   8184       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   8185    }
   8186 
   8187    /* t_mask is now a suitable byte mask */
   8188 
   8189    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   8190 
   8191    if (op != BtOpNone) {
   8192       switch (op) {
   8193          case BtOpSet:
   8194             assign( t_new,
   8195                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   8196             break;
   8197          case BtOpComp:
   8198             assign( t_new,
   8199                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   8200             break;
   8201          case BtOpReset:
   8202             assign( t_new,
   8203                     binop(Iop_And8, mkexpr(t_fetched),
   8204                                     unop(Iop_Not8, mkexpr(t_mask))) );
   8205             break;
   8206          default:
   8207             vpanic("dis_bt_G_E(amd64)");
   8208       }
   8209       if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
   8210          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   8211                                  mkexpr(t_new)/*new*/,
   8212                                  guest_RIP_curr_instr );
   8213       } else {
   8214          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   8215       }
   8216    }
   8217 
   8218    /* Side effect done; now get selected bit into Carry flag.  The Intel docs
   8219       (as of 2015, at least) say that C holds the result, Z is unchanged, and
   8220       O,S,A and P are undefined.  However, on Skylake it appears that O,S,A,P
   8221       are also unchanged, so let's do that. */
   8222    const ULong maskC     = AMD64G_CC_MASK_C;
   8223    const ULong maskOSZAP = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S
   8224                            | AMD64G_CC_MASK_Z | AMD64G_CC_MASK_A
   8225                            | AMD64G_CC_MASK_P;
   8226 
   8227    IRTemp old_rflags = newTemp(Ity_I64);
   8228    assign(old_rflags, mk_amd64g_calculate_rflags_all());
   8229 
   8230    IRTemp new_rflags = newTemp(Ity_I64);
   8231    assign(new_rflags,
   8232           binop(Iop_Or64,
   8233                 binop(Iop_And64, mkexpr(old_rflags), mkU64(maskOSZAP)),
   8234                 binop(Iop_And64,
   8235                       binop(Iop_Shr64,
   8236                             unop(Iop_8Uto64, mkexpr(t_fetched)),
   8237                             mkexpr(t_bitno2)),
   8238                       mkU64(maskC))));
   8239 
   8240    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8241    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8242    stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
   8243    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8244       elimination of previous stores to this field work better. */
   8245    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8246 
   8247    /* Move reg operand from stack back to reg */
   8248    if (epartIsReg(modrm)) {
   8249       /* t_rsp still points at it. */
    8250       /* Only write the reg back if actually modifying it; doing so
    8251          otherwise would erroneously zero the top half for btl, due to
    8252          the standard 32-bit zero-extend rule. */
   8253       if (op != BtOpNone)
   8254          putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
   8255       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   8256    }
   8257 
   8258    DIP("bt%s%c %s, %s\n",
   8259        nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
   8260        ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
   8261 
   8262    return delta;
   8263 }
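
/* To make the bit-test address arithmetic above concrete: the
   following plain-C sketch (illustrative only, kept out of
   compilation; names are hypothetical) shows the same split of a
   bit number into a byte address and a bit-within-byte offset. */
#if 0
#include <stdint.h>
static int demo_bt_split ( const uint8_t* base, int64_t bitno )
{
   /* Arithmetic shift, as with Iop_Sar64 above; bitno may be
      negative for the memory forms of BT. */
   const uint8_t* p   = base + (bitno >> 3); /* byte holding the bit */
   int            bit = (int)(bitno & 7);    /* offset within that byte */
   return (*p >> bit) & 1;                   /* current value of the bit */
}
#endif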
   8264 
   8265 
   8266 
   8267 /* Handle BSF/BSR.  Only v-size seems necessary. */
   8268 static
   8269 ULong dis_bs_E_G ( const VexAbiInfo* vbi,
   8270                    Prefix pfx, Int sz, Long delta, Bool fwds )
   8271 {
   8272    Bool   isReg;
   8273    UChar  modrm;
   8274    HChar  dis_buf[50];
   8275 
   8276    IRType ty    = szToITy(sz);
   8277    IRTemp src   = newTemp(ty);
   8278    IRTemp dst   = newTemp(ty);
   8279    IRTemp src64 = newTemp(Ity_I64);
   8280    IRTemp dst64 = newTemp(Ity_I64);
   8281    IRTemp srcB  = newTemp(Ity_I1);
   8282 
   8283    vassert(sz == 8 || sz == 4 || sz == 2);
   8284 
   8285    modrm = getUChar(delta);
   8286    isReg = epartIsReg(modrm);
   8287    if (isReg) {
   8288       delta++;
   8289       assign( src, getIRegE(sz, pfx, modrm) );
   8290    } else {
   8291       Int    len;
   8292       IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   8293       delta += len;
   8294       assign( src, loadLE(ty, mkexpr(addr)) );
   8295    }
   8296 
   8297    DIP("bs%c%c %s, %s\n",
   8298        fwds ? 'f' : 'r', nameISize(sz),
   8299        ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
   8300        nameIRegG(sz, pfx, modrm));
   8301 
   8302    /* First, widen src to 64 bits if it is not already. */
   8303    assign( src64, widenUto64(mkexpr(src)) );
   8304 
   8305    /* Generate a bool expression which is zero iff the original is
   8306       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
   8307       instrumented by Memcheck, is instrumented expensively, since
   8308       this may be used on the output of a preceding movmskb insn,
   8309       which has been known to be partially defined, and in need of
   8310       careful handling. */
   8311    assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );
   8312 
   8313    /* Flags: Z is 1 iff source value is zero.  All others
   8314       are undefined -- we force them to zero. */
   8315    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8316    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8317    stmt( IRStmt_Put(
   8318             OFFB_CC_DEP1,
   8319             IRExpr_ITE( mkexpr(srcB),
   8320                         /* src!=0 */
   8321                         mkU64(0),
   8322                         /* src==0 */
   8323                         mkU64(AMD64G_CC_MASK_Z)
   8324                         )
   8325        ));
   8326    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8327       elimination of previous stores to this field work better. */
   8328    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8329 
    8330    /* Result: if the source value is zero, we can't use
    8331       Iop_Clz64/Iop_Ctz64, as they have no defined result in that case.
    8332       In any case, amd64 semantics say the result is undefined in
    8333       such situations.  Hence handle the zero case specially. */
   8334 
   8335    /* Bleh.  What we compute:
   8336 
   8337           bsf64:  if src == 0 then {dst is unchanged}
   8338                               else Ctz64(src)
   8339 
   8340           bsr64:  if src == 0 then {dst is unchanged}
   8341                               else 63 - Clz64(src)
   8342 
   8343           bsf32:  if src == 0 then {dst is unchanged}
   8344                               else Ctz64(32Uto64(src))
   8345 
   8346           bsr32:  if src == 0 then {dst is unchanged}
   8347                               else 63 - Clz64(32Uto64(src))
   8348 
   8349           bsf16:  if src == 0 then {dst is unchanged}
   8350                               else Ctz64(32Uto64(16Uto32(src)))
   8351 
   8352           bsr16:  if src == 0 then {dst is unchanged}
   8353                               else 63 - Clz64(32Uto64(16Uto32(src)))
   8354    */
   8355 
   8356    /* The main computation, guarding against zero. */
   8357    assign( dst64,
   8358            IRExpr_ITE(
   8359               mkexpr(srcB),
   8360               /* src != 0 */
   8361               fwds ? unop(Iop_Ctz64, mkexpr(src64))
   8362                    : binop(Iop_Sub64,
   8363                            mkU64(63),
   8364                            unop(Iop_Clz64, mkexpr(src64))),
   8365               /* src == 0 -- leave dst unchanged */
   8366               widenUto64( getIRegG( sz, pfx, modrm ) )
   8367            )
   8368          );
   8369 
   8370    if (sz == 2)
   8371       assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   8372    else
   8373    if (sz == 4)
   8374       assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   8375    else
   8376       assign( dst, mkexpr(dst64) );
   8377 
   8378    /* dump result back */
   8379    putIRegG( sz, pfx, modrm, mkexpr(dst) );
   8380 
   8381    return delta;
   8382 }
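
/* A plain-C sketch of the BSF/BSR recipe above (illustrative only,
   kept out of compilation).  It uses GCC/Clang builtins purely for
   demonstration; names are hypothetical. */
#if 0
#include <stdint.h>
static uint64_t demo_bsf64 ( uint64_t src, uint64_t old_dst )
{
   /* src == 0: amd64 leaves the result undefined; model "dst
      unchanged", as the IR above does. */
   return src == 0 ? old_dst : (uint64_t)__builtin_ctzll(src);
}
static uint64_t demo_bsr64 ( uint64_t src, uint64_t old_dst )
{
   return src == 0 ? old_dst : 63 - (uint64_t)__builtin_clzll(src);
}
#endif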
   8383 
   8384 
    8385 /* Swap rAX with the reg specified by regLo3 and REX.B. */
   8386 static
   8387 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8388 {
   8389    IRType ty = szToITy(sz);
   8390    IRTemp t1 = newTemp(ty);
   8391    IRTemp t2 = newTemp(ty);
   8392    vassert(sz == 2 || sz == 4 || sz == 8);
   8393    vassert(regLo3 < 8);
   8394    if (sz == 8) {
   8395       assign( t1, getIReg64(R_RAX) );
   8396       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8397       putIReg64( R_RAX, mkexpr(t2) );
   8398       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8399    } else if (sz == 4) {
   8400       assign( t1, getIReg32(R_RAX) );
   8401       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8402       putIReg32( R_RAX, mkexpr(t2) );
   8403       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8404    } else {
   8405       assign( t1, getIReg16(R_RAX) );
   8406       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8407       putIReg16( R_RAX, mkexpr(t2) );
   8408       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8409    }
   8410    DIP("xchg%c %s, %s\n",
   8411        nameISize(sz), nameIRegRAX(sz),
   8412                       nameIRegRexB(sz,pfx, regLo3));
   8413 }
   8414 
   8415 
   8416 static
   8417 void codegen_SAHF ( void )
   8418 {
   8419    /* Set the flags to:
   8420       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   8421                                     -- retain the old O flag
   8422       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
    8423                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C))
   8424    */
   8425    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8426                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8427    IRTemp oldflags   = newTemp(Ity_I64);
   8428    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   8429    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8430    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8431    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8432    stmt( IRStmt_Put( OFFB_CC_DEP1,
   8433          binop(Iop_Or64,
   8434                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   8435                binop(Iop_And64,
   8436                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   8437                      mkU64(mask_SZACP))
   8438               )
   8439    ));
   8440 }
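
/* The SAHF merge above, restated on plain integers (illustrative
   only, kept out of compilation).  This assumes the AMD64G_CC_MASK_*
   constants use the architectural rflags bit positions; names are
   hypothetical. */
#if 0
#include <stdint.h>
static uint64_t demo_sahf ( uint64_t old_rflags, uint64_t rax )
{
   const uint64_t MASK_O     = 0x800;                  /* OF */
   const uint64_t MASK_SZACP = 0x80|0x40|0x10|0x4|0x1; /* SF,ZF,AF,PF,CF */
   /* Keep the old O flag; take S,Z,A,C,P from %ah (bits 15:8 of RAX). */
   return (old_rflags & MASK_O) | ((rax >> 8) & MASK_SZACP);
}
#endif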
   8441 
   8442 
   8443 static
   8444 void codegen_LAHF ( void  )
   8445 {
   8446    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   8447    IRExpr* rax_with_hole;
   8448    IRExpr* new_byte;
   8449    IRExpr* new_rax;
   8450    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8451                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8452 
   8453    IRTemp  flags = newTemp(Ity_I64);
   8454    assign( flags, mk_amd64g_calculate_rflags_all() );
   8455 
   8456    rax_with_hole
   8457       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8458    new_byte
   8459       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8460                         mkU64(1<<1));
   8461    new_rax
   8462       = binop(Iop_Or64, rax_with_hole,
   8463                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8464    putIReg64(R_RAX, new_rax);
   8465 }
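
/* The matching LAHF direction, again as a plain-C sketch
   (illustrative only, kept out of compilation; same bit-position
   assumption as the SAHF sketch above). */
#if 0
#include <stdint.h>
static uint64_t demo_lahf ( uint64_t rflags, uint64_t rax )
{
   const uint64_t MASK_SZACP = 0x80|0x40|0x10|0x4|0x1;
   /* Bit 1 of eflags always reads as 1, hence the OR with 1<<1. */
   uint64_t new_byte = (rflags & MASK_SZACP) | (1ULL << 1);
   return (rax & ~0xFF00ULL) | (new_byte << 8);
}
#endif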
   8466 
   8467 
   8468 static
   8469 ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
   8470                         const VexAbiInfo*  vbi,
   8471                         Prefix       pfx,
   8472                         Int          size,
   8473                         Long         delta0 )
   8474 {
   8475    HChar dis_buf[50];
   8476    Int   len;
   8477 
   8478    IRType ty    = szToITy(size);
   8479    IRTemp acc   = newTemp(ty);
   8480    IRTemp src   = newTemp(ty);
   8481    IRTemp dest  = newTemp(ty);
   8482    IRTemp dest2 = newTemp(ty);
   8483    IRTemp acc2  = newTemp(ty);
   8484    IRTemp cond  = newTemp(Ity_I1);
   8485    IRTemp addr  = IRTemp_INVALID;
   8486    UChar  rm    = getUChar(delta0);
   8487 
   8488    /* There are 3 cases to consider:
   8489 
   8490       reg-reg: ignore any lock prefix, generate sequence based
   8491                on ITE
   8492 
   8493       reg-mem, not locked: ignore any lock prefix, generate sequence
   8494                            based on ITE
   8495 
   8496       reg-mem, locked: use IRCAS
   8497    */
   8498 
    8499    /* Decide whether F2 or F3 is acceptable.  Never for the register
    8500       case; for the memory case, one or the other (but not both) is OK
    8501       provided LOCK is also present. */
   8502    if (epartIsReg(rm)) {
   8503       if (haveF2orF3(pfx)) {
   8504          *ok = False;
   8505          return delta0;
   8506       }
   8507    } else {
   8508       if (haveF2orF3(pfx)) {
   8509          if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
   8510             *ok = False;
   8511             return delta0;
   8512          }
   8513       }
   8514    }
   8515 
   8516    if (epartIsReg(rm)) {
   8517       /* case 1 */
   8518       assign( dest, getIRegE(size, pfx, rm) );
   8519       delta0++;
   8520       assign( src, getIRegG(size, pfx, rm) );
   8521       assign( acc, getIRegRAX(size) );
   8522       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8523       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8524       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   8525       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8526       putIRegRAX(size, mkexpr(acc2));
   8527       putIRegE(size, pfx, rm, mkexpr(dest2));
   8528       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8529                                nameIRegG(size,pfx,rm),
   8530                                nameIRegE(size,pfx,rm) );
   8531    }
   8532    else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
   8533       /* case 2 */
   8534       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8535       assign( dest, loadLE(ty, mkexpr(addr)) );
   8536       delta0 += len;
   8537       assign( src, getIRegG(size, pfx, rm) );
   8538       assign( acc, getIRegRAX(size) );
   8539       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8540       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8541       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   8542       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8543       putIRegRAX(size, mkexpr(acc2));
   8544       storeLE( mkexpr(addr), mkexpr(dest2) );
   8545       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8546                                nameIRegG(size,pfx,rm), dis_buf);
   8547    }
   8548    else if (!epartIsReg(rm) && haveLOCK(pfx)) {
   8549       /* case 3 */
   8550       /* src is new value.  acc is expected value.  dest is old value.
   8551          Compute success from the output of the IRCAS, and steer the
   8552          new value for RAX accordingly: in case of success, RAX is
   8553          unchanged. */
   8554       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8555       delta0 += len;
   8556       assign( src, getIRegG(size, pfx, rm) );
   8557       assign( acc, getIRegRAX(size) );
   8558       stmt( IRStmt_CAS(
   8559          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   8560                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   8561       ));
   8562       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8563       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8564       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8565       putIRegRAX(size, mkexpr(acc2));
   8566       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8567                                nameIRegG(size,pfx,rm), dis_buf);
   8568    }
   8569    else vassert(0);
   8570 
   8571    *ok = True;
   8572    return delta0;
   8573 }
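
/* The CMPXCHG semantics implemented three ways above, condensed into
   a plain-C sketch for one operand size (illustrative only, kept out
   of compilation; names are hypothetical).  ZF is set iff the
   accumulator equals the old destination value. */
#if 0
#include <stdint.h>
#include <stdbool.h>
static bool demo_cmpxchg64 ( uint64_t* dest, uint64_t* rax, uint64_t src )
{
   uint64_t old = *dest;
   bool     zf  = (*rax == old);
   if (zf) *dest = src;   /* success: destination takes the new value */
   else    *rax  = old;   /* failure: accumulator observes the old value */
   return zf;
}
#endif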
   8574 
   8575 
   8576 /* Handle conditional move instructions of the form
   8577       cmovcc E(reg-or-mem), G(reg)
   8578 
   8579    E(src) is reg-or-mem
   8580    G(dst) is reg.
   8581 
   8582    If E is reg, -->    GET %E, tmps
   8583                        GET %G, tmpd
   8584                        CMOVcc tmps, tmpd
   8585                        PUT tmpd, %G
   8586 
   8587    If E is mem  -->    (getAddr E) -> tmpa
   8588                        LD (tmpa), tmps
   8589                        GET %G, tmpd
   8590                        CMOVcc tmps, tmpd
   8591                        PUT tmpd, %G
   8592 */
   8593 static
   8594 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
   8595                      Prefix        pfx,
   8596                      Int           sz,
   8597                      AMD64Condcode cond,
   8598                      Long          delta0 )
   8599 {
   8600    UChar rm  = getUChar(delta0);
   8601    HChar dis_buf[50];
   8602    Int   len;
   8603 
   8604    IRType ty   = szToITy(sz);
   8605    IRTemp tmps = newTemp(ty);
   8606    IRTemp tmpd = newTemp(ty);
   8607 
   8608    if (epartIsReg(rm)) {
   8609       assign( tmps, getIRegE(sz, pfx, rm) );
   8610       assign( tmpd, getIRegG(sz, pfx, rm) );
   8611 
   8612       putIRegG( sz, pfx, rm,
   8613                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8614                             mkexpr(tmps),
   8615                             mkexpr(tmpd) )
   8616               );
   8617       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8618                             nameIRegE(sz,pfx,rm),
   8619                             nameIRegG(sz,pfx,rm));
   8620       return 1+delta0;
   8621    }
   8622 
   8623    /* E refers to memory */
   8624    {
   8625       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8626       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8627       assign( tmpd, getIRegG(sz, pfx, rm) );
   8628 
   8629       putIRegG( sz, pfx, rm,
   8630                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8631                             mkexpr(tmps),
   8632                             mkexpr(tmpd) )
   8633               );
   8634 
   8635       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8636                             dis_buf,
   8637                             nameIRegG(sz,pfx,rm));
   8638       return len+delta0;
   8639    }
   8640 }
   8641 
   8642 
   8643 static
   8644 ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
   8645                      const VexAbiInfo* vbi,
   8646                      Prefix pfx, Int sz, Long delta0 )
   8647 {
   8648    Int   len;
   8649    UChar rm = getUChar(delta0);
   8650    HChar dis_buf[50];
   8651 
   8652    IRType ty    = szToITy(sz);
   8653    IRTemp tmpd  = newTemp(ty);
   8654    IRTemp tmpt0 = newTemp(ty);
   8655    IRTemp tmpt1 = newTemp(ty);
   8656 
   8657    /* There are 3 cases to consider:
   8658 
   8659       reg-reg: ignore any lock prefix,
   8660                generate 'naive' (non-atomic) sequence
   8661 
   8662       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   8663                            (non-atomic) sequence
   8664 
   8665       reg-mem, locked: use IRCAS
   8666    */
   8667 
   8668    if (epartIsReg(rm)) {
   8669       /* case 1 */
   8670       assign( tmpd, getIRegE(sz, pfx, rm) );
   8671       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8672       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8673                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8674       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8675       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8676       putIRegE(sz, pfx, rm, mkexpr(tmpt1));
   8677       DIP("xadd%c %s, %s\n",
   8678           nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
   8679       *decode_ok = True;
   8680       return 1+delta0;
   8681    }
   8682    else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
   8683       /* case 2 */
   8684       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8685       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8686       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8687       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8688                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8689       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8690       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   8691       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8692       DIP("xadd%c %s, %s\n",
   8693           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8694       *decode_ok = True;
   8695       return len+delta0;
   8696    }
   8697    else if (!epartIsReg(rm) && haveLOCK(pfx)) {
   8698       /* case 3 */
   8699       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8700       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8701       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8702       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8703                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8704       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   8705                            mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
   8706       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8707       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8708       DIP("xadd%c %s, %s\n",
   8709           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8710       *decode_ok = True;
   8711       return len+delta0;
   8712    }
   8713    /*UNREACHED*/
   8714    vassert(0);
   8715 }
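
/* XADD's exchange-and-add, as a plain-C sketch for one operand size
   (illustrative only, kept out of compilation; names are
   hypothetical).  The destination gains the sum; the source register
   receives the old destination value. */
#if 0
#include <stdint.h>
static void demo_xadd64 ( uint64_t* dest, uint64_t* src )
{
   uint64_t old = *dest;
   *dest = old + *src;    /* flags would be set from this addition */
   *src  = old;
}
#endif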
   8716 
   8717 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8718 //..
   8719 //.. static
   8720 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8721 //.. {
   8722 //..    Int    len;
   8723 //..    IRTemp addr;
   8724 //..    UChar  rm  = getUChar(delta0);
   8725 //..    HChar  dis_buf[50];
   8726 //..
   8727 //..    if (epartIsReg(rm)) {
   8728 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8729 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8730 //..       return 1+delta0;
   8731 //..    } else {
   8732 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8733 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8734 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8735 //..       return len+delta0;
   8736 //..    }
   8737 //.. }
   8738 //..
   8739 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8740 //..    dst is ireg and sz==4, zero out top half of it.  */
   8741 //..
   8742 //.. static
   8743 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8744 //..                      Int   sz,
   8745 //..                      UInt  delta0 )
   8746 //.. {
   8747 //..    Int    len;
   8748 //..    IRTemp addr;
   8749 //..    UChar  rm  = getUChar(delta0);
   8750 //..    HChar  dis_buf[50];
   8751 //..
   8752 //..    vassert(sz == 2 || sz == 4);
   8753 //..
   8754 //..    if (epartIsReg(rm)) {
   8755 //..       if (sz == 4)
   8756 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8757 //..       else
   8758 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8759 //..
   8760 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8761 //..       return 1+delta0;
   8762 //..    } else {
   8763 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8764 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8765 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8766 //..       return len+delta0;
   8767 //..    }
   8768 //.. }
   8769 
   8770 /* Handle move instructions of the form
   8771       mov S, E  meaning
   8772       mov sreg, reg-or-mem
    8773    Is passed the delta of the modRM byte, and the data size.  Returns
    8774    the delta advanced past this instruction.
   8775 
    8776    VEX does not currently simulate segment registers on AMD64, which means
    8777    that instead of moving the value of a segment register, zero is moved
    8778    to the destination.  The zero value represents a null (unused) selector.  This is
   8779    not correct (especially for the %cs, %fs and %gs registers) but it seems to
   8780    provide a sufficient simulation for currently seen programs that use this
   8781    instruction.  If some program actually decides to use the obtained segment
   8782    selector for something meaningful then the zero value should be a clear
   8783    indicator that there is some problem.
   8784 
   8785    S(src) is sreg.
   8786    E(dst) is reg-or-mem
   8787 
   8788    If E is reg, -->    PUT $0, %E
   8789 
   8790    If E is mem, -->    (getAddr E) -> tmpa
   8791                        ST $0, (tmpa)
   8792 */
   8793 static
   8794 ULong dis_mov_S_E ( const VexAbiInfo* vbi,
   8795                     Prefix      pfx,
   8796                     Int         size,
   8797                     Long        delta0 )
   8798 {
   8799    Int   len;
   8800    UChar rm = getUChar(delta0);
   8801    HChar dis_buf[50];
   8802 
   8803    if (epartIsReg(rm)) {
   8804       putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
   8805       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8806                          nameIRegE(size, pfx, rm));
   8807       return 1+delta0;
   8808    }
   8809 
   8810    /* E refers to memory */
   8811    {
   8812       IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
   8813       storeLE(mkexpr(addr), mkU16(0));
   8814       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8815                          dis_buf);
   8816       return len+delta0;
   8817    }
   8818 }
   8819 
   8820 //.. static
   8821 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8822 //.. {
   8823 //..     IRTemp t1 = newTemp(Ity_I16);
   8824 //..     IRTemp ta = newTemp(Ity_I32);
   8825 //..     vassert(sz == 2 || sz == 4);
   8826 //..
   8827 //..     assign( t1, getSReg(sreg) );
   8828 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8829 //..     putIReg(4, R_ESP, mkexpr(ta));
   8830 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8831 //..
   8832 //..     DIP("pushw %s\n", nameSReg(sreg));
   8833 //.. }
   8834 //..
   8835 //.. static
   8836 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8837 //.. {
   8838 //..     IRTemp t1 = newTemp(Ity_I16);
   8839 //..     IRTemp ta = newTemp(Ity_I32);
   8840 //..     vassert(sz == 2 || sz == 4);
   8841 //..
   8842 //..     assign( ta, getIReg(4, R_ESP) );
   8843 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8844 //..
   8845 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8846 //..     putSReg( sreg, mkexpr(t1) );
   8847 //..     DIP("pop %s\n", nameSReg(sreg));
   8848 //.. }
   8849 
   8850 static
   8851 void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
   8852 {
   8853    IRTemp t1 = newTemp(Ity_I64);
   8854    IRTemp t2 = newTemp(Ity_I64);
   8855    IRTemp t3 = newTemp(Ity_I64);
   8856    assign(t1, getIReg64(R_RSP));
   8857    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8858    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8859    putIReg64(R_RSP, mkexpr(t3));
   8860    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8861    jmp_treg(dres, Ijk_Ret, t2);
   8862    vassert(dres->whatNext == Dis_StopHere);
   8863 }
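
/* The RET-with-immediate stack discipline modelled above, as a
   plain-C sketch (illustrative only, kept out of compilation; names
   are hypothetical): pop the return address, then discard imm extra
   bytes of stack. */
#if 0
#include <stdint.h>
static uint64_t demo_ret_imm ( uint64_t* rsp_reg, uint64_t imm )
{
   uint64_t rsp = *rsp_reg;
   uint64_t nia = *(const uint64_t*)(uintptr_t)rsp; /* popped return addr */
   *rsp_reg = rsp + 8 + imm;   /* skip the return addr plus imm bytes */
   return nia;                 /* next instruction address */
}
#endif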
   8864 
   8865 
   8866 /*------------------------------------------------------------*/
   8867 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8868 /*------------------------------------------------------------*/
   8869 
   8870 /* Indicates whether the op requires a rounding-mode argument.  Note
   8871    that this covers only vector floating point arithmetic ops, and
   8872    omits the scalar ones that need rounding modes.  Note also that
   8873    inconsistencies here will get picked up later by the IR sanity
   8874    checker, so this isn't correctness-critical. */
   8875 static Bool requiresRMode ( IROp op )
   8876 {
   8877    switch (op) {
   8878       /* 128 bit ops */
   8879       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8880       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8881       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8882       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8883       /* 256 bit ops */
   8884       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8885       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8886       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8887       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8888          return True;
   8889       default:
   8890          break;
   8891    }
   8892    return False;
   8893 }
   8894 
   8895 
   8896 /* Worker function; do not call directly.
   8897    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8898 */
   8899 
   8900 static ULong dis_SSE_E_to_G_all_wrk (
   8901                 const VexAbiInfo* vbi,
   8902                 Prefix pfx, Long delta,
   8903                 const HChar* opname, IROp op,
   8904                 Bool   invertG
   8905              )
   8906 {
   8907    HChar   dis_buf[50];
   8908    Int     alen;
   8909    IRTemp  addr;
   8910    UChar   rm = getUChar(delta);
   8911    Bool    needsRMode = requiresRMode(op);
   8912    IRExpr* gpart
   8913       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8914                 : getXMMReg(gregOfRexRM(pfx,rm));
   8915    if (epartIsReg(rm)) {
   8916       putXMMReg(
   8917          gregOfRexRM(pfx,rm),
   8918          needsRMode
   8919             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8920                         gpart,
   8921                         getXMMReg(eregOfRexRM(pfx,rm)))
   8922             : binop(op, gpart,
   8923                         getXMMReg(eregOfRexRM(pfx,rm)))
   8924       );
   8925       DIP("%s %s,%s\n", opname,
   8926                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8927                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8928       return delta+1;
   8929    } else {
   8930       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8931       putXMMReg(
   8932          gregOfRexRM(pfx,rm),
   8933          needsRMode
   8934             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8935                         gpart,
   8936                         loadLE(Ity_V128, mkexpr(addr)))
   8937             : binop(op, gpart,
   8938                         loadLE(Ity_V128, mkexpr(addr)))
   8939       );
   8940       DIP("%s %s,%s\n", opname,
   8941                         dis_buf,
   8942                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8943       return delta+alen;
   8944    }
   8945 }
   8946 
   8947 
   8948 /* All lanes SSE binary operation, G = G `op` E. */
   8949 
   8950 static
   8951 ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
   8952                            Prefix pfx, Long delta,
   8953                            const HChar* opname, IROp op )
   8954 {
   8955    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8956 }
   8957 
   8958 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8959 
   8960 static
   8961 ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
   8962                                 Prefix pfx, Long delta,
   8963                                 const HChar* opname, IROp op )
   8964 {
   8965    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8966 }
   8967 
   8968 
   8969 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8970 
   8971 static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
   8972                                    Prefix pfx, Long delta,
   8973                                    const HChar* opname, IROp op )
   8974 {
   8975    HChar   dis_buf[50];
   8976    Int     alen;
   8977    IRTemp  addr;
   8978    UChar   rm = getUChar(delta);
   8979    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8980    if (epartIsReg(rm)) {
   8981       putXMMReg( gregOfRexRM(pfx,rm),
   8982                  binop(op, gpart,
   8983                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8984       DIP("%s %s,%s\n", opname,
   8985                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8986                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8987       return delta+1;
   8988    } else {
   8989       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8990          E operand needs to be made simply of zeroes. */
   8991       IRTemp epart = newTemp(Ity_V128);
   8992       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8993       assign( epart, unop( Iop_32UtoV128,
   8994                            loadLE(Ity_I32, mkexpr(addr))) );
   8995       putXMMReg( gregOfRexRM(pfx,rm),
   8996                  binop(op, gpart, mkexpr(epart)) );
   8997       DIP("%s %s,%s\n", opname,
   8998                         dis_buf,
   8999                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9000       return delta+alen;
   9001    }
   9002 }
   9003 
   9004 
   9005 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   9006 
   9007 static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
   9008                                    Prefix pfx, Long delta,
   9009                                    const HChar* opname, IROp op )
   9010 {
   9011    HChar   dis_buf[50];
   9012    Int     alen;
   9013    IRTemp  addr;
   9014    UChar   rm = getUChar(delta);
   9015    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   9016    if (epartIsReg(rm)) {
   9017       putXMMReg( gregOfRexRM(pfx,rm),
   9018                  binop(op, gpart,
   9019                            getXMMReg(eregOfRexRM(pfx,rm))) );
   9020       DIP("%s %s,%s\n", opname,
   9021                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9022                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9023       return delta+1;
   9024    } else {
   9025       /* We can only do a 64-bit memory read, so the upper half of the
   9026          E operand needs to be made simply of zeroes. */
   9027       IRTemp epart = newTemp(Ity_V128);
   9028       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9029       assign( epart, unop( Iop_64UtoV128,
   9030                            loadLE(Ity_I64, mkexpr(addr))) );
   9031       putXMMReg( gregOfRexRM(pfx,rm),
   9032                  binop(op, gpart, mkexpr(epart)) );
   9033       DIP("%s %s,%s\n", opname,
   9034                         dis_buf,
   9035                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9036       return delta+alen;
   9037    }
   9038 }
   9039 
   9040 
   9041 /* All lanes unary SSE operation, G = op(E). */
   9042 
   9043 static ULong dis_SSE_E_to_G_unary_all (
   9044                 const VexAbiInfo* vbi,
   9045                 Prefix pfx, Long delta,
   9046                 const HChar* opname, IROp op
   9047              )
   9048 {
   9049    HChar   dis_buf[50];
   9050    Int     alen;
   9051    IRTemp  addr;
   9052    UChar   rm = getUChar(delta);
   9053    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   9054    // up in the usual way.
   9055    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   9056    if (epartIsReg(rm)) {
   9057       IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
   9058       /* XXXROUNDINGFIXME */
   9059       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   9060                               : unop(op, src);
   9061       putXMMReg( gregOfRexRM(pfx,rm), res );
   9062       DIP("%s %s,%s\n", opname,
   9063                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9064                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9065       return delta+1;
   9066    } else {
   9067       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9068       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
   9069       /* XXXROUNDINGFIXME */
   9070       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   9071                               : unop(op, src);
   9072       putXMMReg( gregOfRexRM(pfx,rm), res );
   9073       DIP("%s %s,%s\n", opname,
   9074                         dis_buf,
   9075                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9076       return delta+alen;
   9077    }
   9078 }
   9079 
   9080 
   9081 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   9082 
   9083 static ULong dis_SSE_E_to_G_unary_lo32 (
   9084                 const VexAbiInfo* vbi,
   9085                 Prefix pfx, Long delta,
   9086                 const HChar* opname, IROp op
   9087              )
   9088 {
   9089    /* First we need to get the old G value and patch the low 32 bits
   9090       of the E operand into it.  Then apply op and write back to G. */
   9091    HChar   dis_buf[50];
   9092    Int     alen;
   9093    IRTemp  addr;
   9094    UChar   rm = getUChar(delta);
   9095    IRTemp  oldG0 = newTemp(Ity_V128);
   9096    IRTemp  oldG1 = newTemp(Ity_V128);
   9097 
   9098    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   9099 
   9100    if (epartIsReg(rm)) {
   9101       assign( oldG1,
   9102               binop( Iop_SetV128lo32,
   9103                      mkexpr(oldG0),
   9104                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   9105       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9106       DIP("%s %s,%s\n", opname,
   9107                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9108                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9109       return delta+1;
   9110    } else {
   9111       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9112       assign( oldG1,
   9113               binop( Iop_SetV128lo32,
   9114                      mkexpr(oldG0),
   9115                      loadLE(Ity_I32, mkexpr(addr)) ));
   9116       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9117       DIP("%s %s,%s\n", opname,
   9118                         dis_buf,
   9119                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9120       return delta+alen;
   9121    }
   9122 }
   9123 
   9124 
   9125 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   9126 
   9127 static ULong dis_SSE_E_to_G_unary_lo64 (
   9128                 const VexAbiInfo* vbi,
   9129                 Prefix pfx, Long delta,
   9130                 const HChar* opname, IROp op
   9131              )
   9132 {
   9133    /* First we need to get the old G value and patch the low 64 bits
   9134       of the E operand into it.  Then apply op and write back to G. */
   9135    HChar   dis_buf[50];
   9136    Int     alen;
   9137    IRTemp  addr;
   9138    UChar   rm = getUChar(delta);
   9139    IRTemp  oldG0 = newTemp(Ity_V128);
   9140    IRTemp  oldG1 = newTemp(Ity_V128);
   9141 
   9142    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   9143 
   9144    if (epartIsReg(rm)) {
   9145       assign( oldG1,
   9146               binop( Iop_SetV128lo64,
   9147                      mkexpr(oldG0),
   9148                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   9149       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9150       DIP("%s %s,%s\n", opname,
   9151                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9152                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9153       return delta+1;
   9154    } else {
   9155       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9156       assign( oldG1,
   9157               binop( Iop_SetV128lo64,
   9158                      mkexpr(oldG0),
   9159                      loadLE(Ity_I64, mkexpr(addr)) ));
   9160       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9161       DIP("%s %s,%s\n", opname,
   9162                         dis_buf,
   9163                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9164       return delta+alen;
   9165    }
   9166 }
   9167 
   9168 
   9169 /* SSE integer binary operation:
   9170       G = G `op` E   (eLeft == False)
   9171       G = E `op` G   (eLeft == True)
   9172 */
   9173 static ULong dis_SSEint_E_to_G(
   9174                 const VexAbiInfo* vbi,
   9175                 Prefix pfx, Long delta,
   9176                 const HChar* opname, IROp op,
   9177                 Bool   eLeft
   9178              )
   9179 {
   9180    HChar   dis_buf[50];
   9181    Int     alen;
   9182    IRTemp  addr;
   9183    UChar   rm = getUChar(delta);
   9184    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   9185    IRExpr* epart = NULL;
   9186    if (epartIsReg(rm)) {
   9187       epart = getXMMReg(eregOfRexRM(pfx,rm));
   9188       DIP("%s %s,%s\n", opname,
   9189                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9190                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9191       delta += 1;
   9192    } else {
   9193       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9194       epart = loadLE(Ity_V128, mkexpr(addr));
   9195       DIP("%s %s,%s\n", opname,
   9196                         dis_buf,
   9197                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9198       delta += alen;
   9199    }
   9200    putXMMReg( gregOfRexRM(pfx,rm),
   9201               eLeft ? binop(op, epart, gpart)
   9202                     : binop(op, gpart, epart) );
   9203    return delta;
   9204 }
   9205 
   9206 
   9207 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   9208    This is all a bit of a kludge in that it ignores the subtleties of
   9209    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9210    spec. */
   9211 static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
   9212                            /*OUT*/IROp* opP,
   9213                            /*OUT*/Bool* postNotP,
   9214                            UInt imm8, Bool all_lanes, Int sz )
   9215 {
   9216    if (imm8 >= 32) return False;
   9217 
   9218    /* First, compute a (preSwap, op, postNot) triple from
   9219       the supplied imm8. */
   9220    Bool pre = False;
   9221    IROp op  = Iop_INVALID;
   9222    Bool not = False;
   9223 
   9224 #  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   9225    // If you add a case here, add a corresponding test for both VCMPSD_128
   9226    // and VCMPSS_128 in avx-1.c.
   9227    // Cases 0xA and above are
   9228    //    "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
   9229    switch (imm8) {
   9230       // "O" = ordered, "U" = unordered
   9231       // "Q" = non-signalling (quiet), "S" = signalling
   9232       //
   9233       //             swap operands?
   9234       //             |
   9235       //             |      cmp op          invert after?
   9236       //             |      |               |
   9237       //             v      v               v
   9238       case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
   9239       case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
   9240       case 0x10: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
   9241       case 0x18: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_US
   9242       //
   9243       case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
   9244       case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
   9245       //
   9246       case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
   9247       case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
   9248       //
   9249       case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
   9250       case 0x13: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_S
   9251       //
   9252       // 0xC: this isn't really right because it returns all-1s when
   9253       // either operand is a NaN, and it should return all-0s.
   9254       case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
   9255       case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
   9256       case 0x14: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_US
   9257       case 0x1C: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OS
   9258       //
   9259       case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
   9260       case 0x15: XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_UQ
   9261       //
   9262       case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
   9263       case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
   9264       //
   9265       case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
   9266       case 0x17: XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_S
   9267       //
   9268       case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
   9269       case 0x19: XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_UQ
   9270       //
   9271       case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
   9272       case 0x1A: XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_UQ
   9273       //
   9274       case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
   9275       case 0x1D: XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OQ
   9276       //
   9277       case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
   9278       case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
   9279       // Unhandled:
   9280       // 0xB  FALSE_OQ
   9281       // 0xF  TRUE_UQ
   9282       // 0x1B  FALSE_OS
   9283       // 0x1F  TRUE_US
   9284       /* Don't forget to add test cases to VCMPSS_128_<imm8> in
   9285          avx-1.c if new cases turn up. */
   9286       default: break;
   9287    }
   9288 #  undef XXX
   9289    if (op == Iop_INVALID) return False;
   9290 
   9291    /* Now convert the op into one with the same arithmetic but that is
   9292       correct for the width and laneage requirements. */
   9293 
   9294    /**/ if (sz == 4 && all_lanes) {
   9295       switch (op) {
   9296          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
   9297          case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
   9298          case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
   9299          case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
   9300          default: vassert(0);
   9301       }
   9302    }
   9303    else if (sz == 4 && !all_lanes) {
   9304       switch (op) {
   9305          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
   9306          case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
   9307          case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
   9308          case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
   9309          default: vassert(0);
   9310       }
   9311    }
   9312    else if (sz == 8 && all_lanes) {
   9313       switch (op) {
   9314          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
   9315          case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
   9316          case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
   9317          case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
   9318          default: vassert(0);
   9319       }
   9320    }
   9321    else if (sz == 8 && !all_lanes) {
   9322       switch (op) {
   9323          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
   9324          case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
   9325          case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
   9326          case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
   9327          default: vassert(0);
   9328       }
   9329    }
   9330    else {
   9331       vpanic("findSSECmpOp(amd64,guest)");
   9332    }
   9333 
   9334    *preSwapP = pre; *opP = op; *postNotP = not;
   9335    return True;
   9336 }
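
/* The (preSwap, op, postNot) decomposition above, restated on scalar
   doubles (illustrative only, kept out of compilation; names are
   hypothetical).  E.g. GE(a,b) becomes LE(b,a) -- swap, no negate --
   while NLT(a,b) becomes !LT(a,b) -- no swap, negate. */
#if 0
#include <stdbool.h>
static bool demo_cmp ( double a, double b, bool preSwap,
                       bool (*op)(double, double), bool postNot )
{
   if (preSwap) { double t = a; a = b; b = t; }
   bool r = op(a, b);
   return postNot ? !r : r;
}
#endif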
   9337 
   9338 
   9339 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9340    returns the original delta to indicate failure. */
   9341 
   9342 static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
   9343                                  Prefix pfx, Long delta,
   9344                                  const HChar* opname, Bool all_lanes, Int sz )
   9345 {
   9346    Long    delta0 = delta;
   9347    HChar   dis_buf[50];
   9348    Int     alen;
   9349    UInt    imm8;
   9350    IRTemp  addr;
   9351    Bool    preSwap = False;
   9352    IROp    op      = Iop_INVALID;
   9353    Bool    postNot = False;
   9354    IRTemp  plain   = newTemp(Ity_V128);
   9355    UChar   rm      = getUChar(delta);
   9356    UShort  mask    = 0;
   9357    vassert(sz == 4 || sz == 8);
   9358    if (epartIsReg(rm)) {
   9359       imm8 = getUChar(delta+1);
   9360       if (imm8 >= 8) return delta0; /* FAIL */
   9361       Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
   9362       if (!ok) return delta0; /* FAIL */
   9363       vassert(!preSwap); /* never needed for imm8 < 8 */
   9364       assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
   9365                                getXMMReg(eregOfRexRM(pfx,rm))) );
   9366       delta += 2;
   9367       DIP("%s $%u,%s,%s\n", opname,
   9368                             imm8,
   9369                             nameXMMReg(eregOfRexRM(pfx,rm)),
   9370                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   9371    } else {
   9372       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   9373       imm8 = getUChar(delta+alen);
   9374       if (imm8 >= 8) return delta0; /* FAIL */
   9375       Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
   9376       if (!ok) return delta0; /* FAIL */
   9377       vassert(!preSwap); /* never needed for imm8 < 8 */
   9378       assign( plain,
   9379               binop(
   9380                  op,
   9381                  getXMMReg(gregOfRexRM(pfx,rm)),
   9382                    all_lanes
   9383                       ? loadLE(Ity_V128, mkexpr(addr))
   9384                    : sz == 8
   9385                       ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   9386                    : /*sz==4*/
   9387                       unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   9388               )
   9389       );
   9390       delta += alen+1;
   9391       DIP("%s $%u,%s,%s\n", opname,
   9392                             imm8,
   9393                             dis_buf,
   9394                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   9395    }
   9396 
   9397    if (postNot && all_lanes) {
   9398       putXMMReg( gregOfRexRM(pfx,rm),
   9399                  unop(Iop_NotV128, mkexpr(plain)) );
   9400    }
   9401    else
   9402    if (postNot && !all_lanes) {
   9403       mask = toUShort(sz==4 ? 0x000F : 0x00FF);
   9404       putXMMReg( gregOfRexRM(pfx,rm),
   9405                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   9406    }
   9407    else {
   9408       putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   9409    }
   9410 
   9411    return delta;
   9412 }
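
/* The mkV128 mask trick used above, sketched in plain C (illustrative
   only, kept out of compilation; names are hypothetical): each of the
   16 immediate bits selects one byte of the 128-bit vector, so 0x000F
   expands to ones in the low 32 bits and 0x00FF in the low 64. */
#if 0
#include <stdint.h>
static void demo_expand_mask16 ( uint16_t imm, uint8_t out[16] )
{
   int i;
   for (i = 0; i < 16; i++)
      out[i] = ((imm >> i) & 1) ? 0xFF : 0x00;
}
#endif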
   9413 
   9414 
   9415 /* Vector by scalar shift of G by the amount specified at the bottom
   9416    of E. */
   9417 
   9418 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
   9419                                   Prefix pfx, Long delta,
   9420                                   const HChar* opname, IROp op )
   9421 {
   9422    HChar   dis_buf[50];
   9423    Int     alen, size;
   9424    IRTemp  addr;
   9425    Bool    shl, shr, sar;
   9426    UChar   rm   = getUChar(delta);
   9427    IRTemp  g0   = newTemp(Ity_V128);
   9428    IRTemp  g1   = newTemp(Ity_V128);
   9429    IRTemp  amt  = newTemp(Ity_I64);
   9430    IRTemp  amt8 = newTemp(Ity_I8);
   9431    if (epartIsReg(rm)) {
   9432       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9433       DIP("%s %s,%s\n", opname,
   9434                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9435                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9436       delta++;
   9437    } else {
   9438       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9439       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9440       DIP("%s %s,%s\n", opname,
   9441                         dis_buf,
   9442                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9443       delta += alen;
   9444    }
   9445    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9446    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9447 
   9448    shl = shr = sar = False;
   9449    size = 0;
   9450    switch (op) {
    9451       case Iop_ShlN16x8: shl = True; size = 16; break;
   9452       case Iop_ShlN32x4: shl = True; size = 32; break;
   9453       case Iop_ShlN64x2: shl = True; size = 64; break;
   9454       case Iop_SarN16x8: sar = True; size = 16; break;
   9455       case Iop_SarN32x4: sar = True; size = 32; break;
   9456       case Iop_ShrN16x8: shr = True; size = 16; break;
   9457       case Iop_ShrN32x4: shr = True; size = 32; break;
   9458       case Iop_ShrN64x2: shr = True; size = 64; break;
   9459       default: vassert(0);
   9460    }
   9461 
   9462    if (shl || shr) {
   9463      assign(
   9464         g1,
   9465         IRExpr_ITE(
   9466            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9467            binop(op, mkexpr(g0), mkexpr(amt8)),
   9468            mkV128(0x0000)
   9469         )
   9470      );
   9471    } else
   9472    if (sar) {
   9473      assign(
   9474         g1,
   9475         IRExpr_ITE(
   9476            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9477            binop(op, mkexpr(g0), mkexpr(amt8)),
   9478            binop(op, mkexpr(g0), mkU8(size-1))
   9479         )
   9480      );
   9481    } else {
   9482       vassert(0);
   9483    }
   9484 
   9485    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9486    return delta;
   9487 }
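
/* The out-of-range handling above, for a single 16-bit lane, as a
   plain-C sketch (illustrative only, kept out of compilation; names
   are hypothetical): logical shifts by >= the lane width give 0,
   while arithmetic right shifts saturate at width-1, replicating the
   sign bit.  Assumes the compiler's >> on signed values is an
   arithmetic shift, as on mainstream compilers. */
#if 0
#include <stdint.h>
static uint16_t demo_psllw_lane ( uint16_t lane, uint64_t amt )
{
   return amt >= 16 ? 0 : (uint16_t)(lane << amt);
}
static int16_t demo_psraw_lane ( int16_t lane, uint64_t amt )
{
   return (int16_t)(lane >> (amt >= 16 ? 15 : amt));
}
#endif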
   9488 
   9489 
   9490 /* Vector by scalar shift of E by an immediate byte. */
   9491 
   9492 static
   9493 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   9494                            Long delta, const HChar* opname, IROp op )
   9495 {
   9496    Bool    shl, shr, sar;
   9497    UChar   rm   = getUChar(delta);
   9498    IRTemp  e0   = newTemp(Ity_V128);
   9499    IRTemp  e1   = newTemp(Ity_V128);
   9500    UChar   amt, size;
   9501    vassert(epartIsReg(rm));
   9502    vassert(gregLO3ofRM(rm) == 2
   9503            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   9504    amt = getUChar(delta+1);
   9505    delta += 2;
   9506    DIP("%s $%d,%s\n", opname,
   9507                       (Int)amt,
   9508                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   9509    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   9510 
   9511    shl = shr = sar = False;
   9512    size = 0;
   9513    switch (op) {
   9514       case Iop_ShlN16x8: shl = True; size = 16; break;
   9515       case Iop_ShlN32x4: shl = True; size = 32; break;
   9516       case Iop_ShlN64x2: shl = True; size = 64; break;
   9517       case Iop_SarN16x8: sar = True; size = 16; break;
   9518       case Iop_SarN32x4: sar = True; size = 32; break;
   9519       case Iop_ShrN16x8: shr = True; size = 16; break;
   9520       case Iop_ShrN32x4: shr = True; size = 32; break;
   9521       case Iop_ShrN64x2: shr = True; size = 64; break;
   9522       default: vassert(0);
   9523    }
   9524 
   9525    if (shl || shr) {
   9526      assign( e1, amt >= size
   9527                     ? mkV128(0x0000)
   9528                     : binop(op, mkexpr(e0), mkU8(amt))
   9529      );
   9530    } else
   9531    if (sar) {
   9532      assign( e1, amt >= size
   9533                     ? binop(op, mkexpr(e0), mkU8(size-1))
   9534                     : binop(op, mkexpr(e0), mkU8(amt))
   9535      );
   9536    } else {
   9537       vassert(0);
   9538    }
   9539 
   9540    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   9541    return delta;
   9542 }
   9543 
   9544 
   9545 /* Get the current SSE rounding mode. */
   9546 
   9547 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9548 {
   9549    return
   9550       unop( Iop_64to32,
   9551             binop( Iop_And64,
   9552                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9553                    mkU64(3) ));
   9554 }
   9555 
   9556 static void put_sse_roundingmode ( IRExpr* sseround )
   9557 {
   9558    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   9559    stmt( IRStmt_Put( OFFB_SSEROUND,
   9560                      unop(Iop_32Uto64,sseround) ) );
   9561 }
   9562 
    9563 /* Break a 128-bit (V128) value up into four 32-bit ints. */
   9564 
   9565 static void breakupV128to32s ( IRTemp t128,
   9566                                /*OUTs*/
   9567                                IRTemp* t3, IRTemp* t2,
   9568                                IRTemp* t1, IRTemp* t0 )
   9569 {
   9570    IRTemp hi64 = newTemp(Ity_I64);
   9571    IRTemp lo64 = newTemp(Ity_I64);
   9572    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9573    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9574 
   9575    vassert(t0 && *t0 == IRTemp_INVALID);
   9576    vassert(t1 && *t1 == IRTemp_INVALID);
   9577    vassert(t2 && *t2 == IRTemp_INVALID);
   9578    vassert(t3 && *t3 == IRTemp_INVALID);
   9579 
   9580    *t0 = newTemp(Ity_I32);
   9581    *t1 = newTemp(Ity_I32);
   9582    *t2 = newTemp(Ity_I32);
   9583    *t3 = newTemp(Ity_I32);
   9584    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9585    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9586    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9587    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9588 }
   9589 
   9590 /* Construct a V128-bit value from four 32-bit ints. */
   9591 
   9592 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9593                                IRTemp t1, IRTemp t0 )
   9594 {
   9595    return
   9596       binop( Iop_64HLtoV128,
   9597              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9598              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9599    );
   9600 }
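
/* breakupV128to32s and mkV128from32s are round-trip inverses:
   rebuilding from (t3,t2,t1,t0) in that order yields the original
   value, t0 being the least significant lane.  Usage sketch
   (illustrative only; rG and sV as in the disassembly routines below):

      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
      putXMMReg( rG, mkV128from32s(s0, s1, s2, s3) );

   which writes sV to register rG with its four 32-bit lanes
   reversed. */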
   9601 
   9602 /* Break a 64-bit value up into four 16-bit ints. */
   9603 
   9604 static void breakup64to16s ( IRTemp t64,
   9605                              /*OUTs*/
   9606                              IRTemp* t3, IRTemp* t2,
   9607                              IRTemp* t1, IRTemp* t0 )
   9608 {
   9609    IRTemp hi32 = newTemp(Ity_I32);
   9610    IRTemp lo32 = newTemp(Ity_I32);
   9611    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9612    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9613 
   9614    vassert(t0 && *t0 == IRTemp_INVALID);
   9615    vassert(t1 && *t1 == IRTemp_INVALID);
   9616    vassert(t2 && *t2 == IRTemp_INVALID);
   9617    vassert(t3 && *t3 == IRTemp_INVALID);
   9618 
   9619    *t0 = newTemp(Ity_I16);
   9620    *t1 = newTemp(Ity_I16);
   9621    *t2 = newTemp(Ity_I16);
   9622    *t3 = newTemp(Ity_I16);
   9623    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9624    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9625    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9626    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9627 }
   9628 
   9629 /* Construct a 64-bit value from four 16-bit ints. */
   9630 
   9631 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9632                              IRTemp t1, IRTemp t0 )
   9633 {
   9634    return
   9635       binop( Iop_32HLto64,
   9636              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9637              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9638    );
   9639 }
   9640 
   9641 /* Break a V256-bit value up into four 64-bit ints. */
   9642 
   9643 static void breakupV256to64s ( IRTemp t256,
   9644                                /*OUTs*/
   9645                                IRTemp* t3, IRTemp* t2,
   9646                                IRTemp* t1, IRTemp* t0 )
   9647 {
   9648    vassert(t0 && *t0 == IRTemp_INVALID);
   9649    vassert(t1 && *t1 == IRTemp_INVALID);
   9650    vassert(t2 && *t2 == IRTemp_INVALID);
   9651    vassert(t3 && *t3 == IRTemp_INVALID);
   9652    *t0 = newTemp(Ity_I64);
   9653    *t1 = newTemp(Ity_I64);
   9654    *t2 = newTemp(Ity_I64);
   9655    *t3 = newTemp(Ity_I64);
   9656    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9657    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9658    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9659    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9660 }
   9661 
   9662 /* Break a V256-bit value up into two V128s. */
   9663 
   9664 static void breakupV256toV128s ( IRTemp t256,
   9665                                  /*OUTs*/
   9666                                  IRTemp* t1, IRTemp* t0 )
   9667 {
   9668    vassert(t0 && *t0 == IRTemp_INVALID);
   9669    vassert(t1 && *t1 == IRTemp_INVALID);
   9670    *t0 = newTemp(Ity_V128);
   9671    *t1 = newTemp(Ity_V128);
   9672    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9673    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9674 }
   9675 
   9676 /* Break a V256-bit value up into eight 32-bit ints.  */
   9677 
   9678 static void breakupV256to32s ( IRTemp t256,
   9679                                /*OUTs*/
   9680                                IRTemp* t7, IRTemp* t6,
   9681                                IRTemp* t5, IRTemp* t4,
   9682                                IRTemp* t3, IRTemp* t2,
   9683                                IRTemp* t1, IRTemp* t0 )
   9684 {
   9685    IRTemp t128_1 = IRTemp_INVALID;
   9686    IRTemp t128_0 = IRTemp_INVALID;
   9687    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9688    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9689    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9690 }
   9691 
   9692 /* Break a V128-bit value up into two 64-bit ints. */
   9693 
   9694 static void breakupV128to64s ( IRTemp t128,
   9695                                /*OUTs*/
   9696                                IRTemp* t1, IRTemp* t0 )
   9697 {
   9698    vassert(t0 && *t0 == IRTemp_INVALID);
   9699    vassert(t1 && *t1 == IRTemp_INVALID);
   9700    *t0 = newTemp(Ity_I64);
   9701    *t1 = newTemp(Ity_I64);
   9702    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9703    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9704 }
   9705 
   9706 /* Construct a V256-bit value from eight 32-bit ints. */
   9707 
   9708 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9709                                IRTemp t5, IRTemp t4,
   9710                                IRTemp t3, IRTemp t2,
   9711                                IRTemp t1, IRTemp t0 )
   9712 {
   9713    return
   9714       binop( Iop_V128HLtoV256,
   9715              binop( Iop_64HLtoV128,
   9716                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9717                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9718              binop( Iop_64HLtoV128,
   9719                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9720                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9721    );
   9722 }
   9723 
   9724 /* Construct a V256-bit value from four 64-bit ints. */
   9725 
   9726 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9727                                IRTemp t1, IRTemp t0 )
   9728 {
   9729    return
   9730       binop( Iop_V128HLtoV256,
   9731              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9732              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9733    );
   9734 }
   9735 
   9736 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9737    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9738 
   9739    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9740 */
   9741 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   9742 {
   9743    IRTemp aa      = newTemp(Ity_I64);
   9744    IRTemp bb      = newTemp(Ity_I64);
   9745    IRTemp aahi32s = newTemp(Ity_I64);
   9746    IRTemp aalo32s = newTemp(Ity_I64);
   9747    IRTemp bbhi32s = newTemp(Ity_I64);
   9748    IRTemp bblo32s = newTemp(Ity_I64);
   9749    IRTemp rHi     = newTemp(Ity_I64);
   9750    IRTemp rLo     = newTemp(Ity_I64);
   9751    IRTemp one32x2 = newTemp(Ity_I64);
   9752    assign(aa, aax);
   9753    assign(bb, bbx);
   9754    assign( aahi32s,
   9755            binop(Iop_SarN32x2,
   9756                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   9757                  mkU8(16) ));
   9758    assign( aalo32s,
   9759            binop(Iop_SarN32x2,
   9760                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   9761                  mkU8(16) ));
   9762    assign( bbhi32s,
   9763            binop(Iop_SarN32x2,
   9764                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   9765                  mkU8(16) ));
   9766    assign( bblo32s,
   9767            binop(Iop_SarN32x2,
   9768                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   9769                  mkU8(16) ));
   9770    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   9771    assign(
   9772       rHi,
   9773       binop(
   9774          Iop_ShrN32x2,
   9775          binop(
   9776             Iop_Add32x2,
   9777             binop(
   9778                Iop_ShrN32x2,
   9779                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   9780                mkU8(14)
   9781             ),
   9782             mkexpr(one32x2)
   9783          ),
   9784          mkU8(1)
   9785       )
   9786    );
   9787    assign(
   9788       rLo,
   9789       binop(
   9790          Iop_ShrN32x2,
   9791          binop(
   9792             Iop_Add32x2,
   9793             binop(
   9794                Iop_ShrN32x2,
   9795                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   9796                mkU8(14)
   9797             ),
   9798             mkexpr(one32x2)
   9799          ),
   9800          mkU8(1)
   9801       )
   9802    );
   9803    return
   9804       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   9805 }
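
/* For reference, the same per-lane computation in scalar C (sketch
   only; the IR above must stay branch-free).  With Q15 fixed-point
   inputs this is a rounding high-half multiply: e.g. a = b = 0x4000
   (0.5) gives ((0x10000000 >> 14) + 1) >> 1 = 0x2000 (0.25). */
#if 0
static Short ref_pmulhrsw_lane ( Short a, Short b )
{
   Int p = ((Int)a * (Int)b) >> 14;  /* 32-bit signed product, scaled */
   return (Short)((p + 1) >> 1);     /* round to nearest, keep 16 bits */
}
#endif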
   9806 
   9807 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9808    values (aa,bb), computes, for each lane:
   9809 
   9810           if aa_lane < 0 then - bb_lane
   9811      else if aa_lane > 0 then bb_lane
   9812      else 0
   9813 */
   9814 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9815 {
   9816    IRTemp aa       = newTemp(Ity_I64);
   9817    IRTemp bb       = newTemp(Ity_I64);
   9818    IRTemp zero     = newTemp(Ity_I64);
   9819    IRTemp bbNeg    = newTemp(Ity_I64);
   9820    IRTemp negMask  = newTemp(Ity_I64);
   9821    IRTemp posMask  = newTemp(Ity_I64);
   9822    IROp   opSub    = Iop_INVALID;
   9823    IROp   opCmpGTS = Iop_INVALID;
   9824 
   9825    switch (laneszB) {
   9826       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9827       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9828       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9829       default: vassert(0);
   9830    }
   9831 
   9832    assign( aa,      aax );
   9833    assign( bb,      bbx );
   9834    assign( zero,    mkU64(0) );
   9835    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9836    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9837    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9838 
   9839    return
   9840       binop(Iop_Or64,
   9841             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9842             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9843 
   9844 }
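
/* Scalar model of the branch-free selection above (illustrative only):
   opCmpGTS yields an all-ones or all-zeroes mask per lane, so the two
   ANDs pick bb, -bb or neither. */
#if 0
static Char ref_psignb_lane ( Char a, Char b )
{
   return a < 0 ? (Char)(-b) : (a > 0 ? b : (Char)0);
}
#endif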
   9845 
   9846 
   9847 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9848    value aa, computes, for each lane
   9849 
   9850    if aa < 0 then -aa else aa
   9851 
   9852    Note that the result is interpreted as unsigned, so that the
   9853    absolute value of the most negative signed input can be
   9854    represented.
   9855 */
   9856 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9857 {
   9858    IRTemp res     = newTemp(Ity_I64);
   9859    IRTemp zero    = newTemp(Ity_I64);
   9860    IRTemp aaNeg   = newTemp(Ity_I64);
   9861    IRTemp negMask = newTemp(Ity_I64);
   9862    IRTemp posMask = newTemp(Ity_I64);
   9863    IROp   opSub   = Iop_INVALID;
   9864    IROp   opSarN  = Iop_INVALID;
   9865 
   9866    switch (laneszB) {
   9867       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9868       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9869       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9870       default: vassert(0);
   9871    }
   9872 
   9873    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9874    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9875    assign( zero,    mkU64(0) );
   9876    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9877    assign( res,
   9878            binop(Iop_Or64,
   9879                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9880                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9881    return res;
   9882 }
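
/* The same trick in scalar form (illustrative only): an arithmetic
   right shift by width-1 gives 0 for non-negative lanes and all-ones
   for negative ones, which then selects between aa and -aa. */
#if 0
static UChar ref_pabsb_lane ( Char a )
{
   Int m = ((Int)a) >> 7;             /* 0 if a >= 0, -1 if a < 0 */
   return (UChar)(((Int)a & ~m) | (-(Int)a & m));
}
/* e.g. a == -128 yields 128, hence the unsigned interpretation. */
#endif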
   9883 
   9884 /* XMM version of math_PABS_MMX. */
   9885 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9886 {
   9887    IRTemp res  = newTemp(Ity_V128);
   9888    IRTemp aaHi = newTemp(Ity_I64);
   9889    IRTemp aaLo = newTemp(Ity_I64);
   9890    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9891    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9892    assign(res, binop(Iop_64HLtoV128,
   9893                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9894                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9895    return res;
   9896 }
   9897 
   9898 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9899    partial applications in C :-( */
   9900 static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   9901    return math_PABS_XMM(aa, 4);
   9902 }
   9903 
   9904 static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   9905    return math_PABS_XMM(aa, 2);
   9906 }
   9907 
   9908 static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   9909    return math_PABS_XMM(aa, 1);
   9910 }
   9911 
   9912 /* YMM version of math_PABS_XMM. */
   9913 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9914 {
   9915    IRTemp res  = newTemp(Ity_V256);
   9916    IRTemp aaHi = IRTemp_INVALID;
   9917    IRTemp aaLo = IRTemp_INVALID;
   9918    breakupV256toV128s(aa, &aaHi, &aaLo);
   9919    assign(res, binop(Iop_V128HLtoV256,
   9920                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9921                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9922    return res;
   9923 }
   9924 
   9925 static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   9926    return math_PABS_YMM(aa, 4);
   9927 }
   9928 
   9929 static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   9930    return math_PABS_YMM(aa, 2);
   9931 }
   9932 
   9933 static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   9934    return math_PABS_YMM(aa, 1);
   9935 }
   9936 
   9937 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9938                                         IRTemp lo64, Long byteShift )
   9939 {
   9940    vassert(byteShift >= 1 && byteShift <= 7);
   9941    return
   9942       binop(Iop_Or64,
   9943             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9944             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9945       );
   9946 }
   9947 
   9948 static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
   9949 {
   9950    IRTemp res = newTemp(Ity_V128);
   9951    IRTemp sHi = newTemp(Ity_I64);
   9952    IRTemp sLo = newTemp(Ity_I64);
   9953    IRTemp dHi = newTemp(Ity_I64);
   9954    IRTemp dLo = newTemp(Ity_I64);
   9955    IRTemp rHi = newTemp(Ity_I64);
   9956    IRTemp rLo = newTemp(Ity_I64);
   9957 
   9958    assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   9959    assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   9960    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   9961    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   9962 
   9963    if (imm8 == 0) {
   9964       assign( rHi, mkexpr(sHi) );
   9965       assign( rLo, mkexpr(sLo) );
   9966    }
   9967    else if (imm8 >= 1 && imm8 <= 7) {
   9968       assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
   9969       assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   9970    }
   9971    else if (imm8 == 8) {
   9972       assign( rHi, mkexpr(dLo) );
   9973       assign( rLo, mkexpr(sHi) );
   9974    }
   9975    else if (imm8 >= 9 && imm8 <= 15) {
   9976       assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
   9977       assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   9978    }
   9979    else if (imm8 == 16) {
   9980       assign( rHi, mkexpr(dHi) );
   9981       assign( rLo, mkexpr(dLo) );
   9982    }
   9983    else if (imm8 >= 17 && imm8 <= 23) {
   9984       assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
   9985       assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   9986    }
   9987    else if (imm8 == 24) {
   9988       assign( rHi, mkU64(0) );
   9989       assign( rLo, mkexpr(dHi) );
   9990    }
   9991    else if (imm8 >= 25 && imm8 <= 31) {
   9992       assign( rHi, mkU64(0) );
   9993       assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   9994    }
   9995    else if (imm8 >= 32 && imm8 <= 255) {
   9996       assign( rHi, mkU64(0) );
   9997       assign( rLo, mkU64(0) );
   9998    }
   9999    else
   10000       vassert(0);
   10001 
   10002    assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   10003    return res;
   10004 }
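
/* Conceptually PALIGNR forms the 32-byte value dV:sV, shifts it right
   by imm8 bytes and keeps the low 16 bytes; the case analysis above
   just expresses this on 64-bit halves.  A byte-array sketch
   (illustrative only; little-endian lane order): */
#if 0
static void ref_palignr ( /*OUT*/UChar* res, const UChar* d,
                          const UChar* s, UInt imm8 )
{
   UChar cat[32];  /* cat = d:s, with s in bytes 0..15 */
   UInt  i;
   for (i = 0; i < 16; i++) { cat[i] = s[i]; cat[16+i] = d[i]; }
   for (i = 0; i < 16; i++)
      res[i] = imm8 + i < 32 ? cat[imm8 + i] : 0;
}
#endif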
   10005 
   10006 
    10007 /* Generate a SIGSEGV followed by a restart of the current instruction
    10008    if effective_addr is not (mask+1)-aligned (mask = alignment-1).
    10009    16-alignment is required for some SSE3 insns and all 128-bit SSSE3
    10010    insns.  This assumes that guest_RIP_curr_instr is set correctly! */
   10011 static
   10012 void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
   10013 {
   10014    stmt(
   10015       IRStmt_Exit(
   10016          binop(Iop_CmpNE64,
   10017                binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
   10018                mkU64(0)),
   10019          Ijk_SigSEGV,
   10020          IRConst_U64(guest_RIP_curr_instr),
   10021          OFFB_RIP
   10022       )
   10023    );
   10024 }
   10025 
   10026 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   10027    gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
   10028 }
   10029 
   10030 static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   10031    gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
   10032 }
   10033 
   10034 static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
   10035    gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
   10036 }
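
/* Since mask is alignment-1, the generated test is the usual
   power-of-two idiom: addr & (align-1) != 0 iff addr is misaligned.
   E.g. for 16-byte alignment, 0x...1008 & 0xF == 8 takes the SIGSEGV
   exit, while 0x...1040 & 0xF == 0 falls through. */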
   10037 
   10038 /* Helper for deciding whether a given insn (starting at the opcode
   10039    byte) may validly be used with a LOCK prefix.  The following insns
   10040    may be used with LOCK when their destination operand is in memory.
   10041    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   10042 
   10043    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   10044    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   10045    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
    10046    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   10047    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   10048    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   10049    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   10050 
   10051    DEC        FE /1,  FF /1
   10052    INC        FE /0,  FF /0
   10053 
   10054    NEG        F6 /3,  F7 /3
   10055    NOT        F6 /2,  F7 /2
   10056 
   10057    XCHG       86, 87
   10058 
   10059    BTC        0F BB,  0F BA /7
   10060    BTR        0F B3,  0F BA /6
   10061    BTS        0F AB,  0F BA /5
   10062 
   10063    CMPXCHG    0F B0,  0F B1
   10064    CMPXCHG8B  0F C7 /1
   10065 
   10066    XADD       0F C0,  0F C1
   10067 
   10068    ------------------------------
   10069 
   10070    80 /0  =  addb $imm8,  rm8
   10071    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   10072    82 /0  =  addb $imm8,  rm8
   10073    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   10074 
   10075    00     =  addb r8,  rm8
   10076    01     =  addl r32, rm32  and  addw r16, rm16
   10077 
   10078    Same for ADD OR ADC SBB AND SUB XOR
   10079 
   10080    FE /1  = dec rm8
   10081    FF /1  = dec rm32  and  dec rm16
   10082 
   10083    FE /0  = inc rm8
   10084    FF /0  = inc rm32  and  inc rm16
   10085 
   10086    F6 /3  = neg rm8
   10087    F7 /3  = neg rm32  and  neg rm16
   10088 
   10089    F6 /2  = not rm8
   10090    F7 /2  = not rm32  and  not rm16
   10091 
   10092    0F BB     = btcw r16, rm16    and  btcl r32, rm32
    10093    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   10094 
   10095    Same for BTS, BTR
   10096 */
   10097 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   10098 {
   10099    switch (opc[0]) {
   10100       case 0x00: case 0x01: case 0x08: case 0x09:
   10101       case 0x10: case 0x11: case 0x18: case 0x19:
   10102       case 0x20: case 0x21: case 0x28: case 0x29:
   10103       case 0x30: case 0x31:
   10104          if (!epartIsReg(opc[1]))
   10105             return True;
   10106          break;
   10107 
   10108       case 0x80: case 0x81: case 0x82: case 0x83:
   10109          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   10110              && !epartIsReg(opc[1]))
   10111             return True;
   10112          break;
   10113 
   10114       case 0xFE: case 0xFF:
   10115          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   10116              && !epartIsReg(opc[1]))
   10117             return True;
   10118          break;
   10119 
   10120       case 0xF6: case 0xF7:
   10121          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   10122              && !epartIsReg(opc[1]))
   10123             return True;
   10124          break;
   10125 
   10126       case 0x86: case 0x87:
   10127          if (!epartIsReg(opc[1]))
   10128             return True;
   10129          break;
   10130 
   10131       case 0x0F: {
   10132          switch (opc[1]) {
   10133             case 0xBB: case 0xB3: case 0xAB:
   10134                if (!epartIsReg(opc[2]))
   10135                   return True;
   10136                break;
   10137             case 0xBA:
   10138                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   10139                    && !epartIsReg(opc[2]))
   10140                   return True;
   10141                break;
   10142             case 0xB0: case 0xB1:
   10143                if (!epartIsReg(opc[2]))
   10144                   return True;
   10145                break;
   10146             case 0xC7:
   10147                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   10148                   return True;
   10149                break;
   10150             case 0xC0: case 0xC1:
   10151                if (!epartIsReg(opc[2]))
   10152                   return True;
   10153                break;
   10154             default:
   10155                break;
   10156          } /* switch (opc[1]) */
   10157          break;
   10158       }
   10159 
   10160       default:
   10161          break;
   10162    } /* switch (opc[0]) */
   10163 
   10164    return False;
   10165 }
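
/* Example: "lock incl (%rax)" reaches here with opc[0] == 0xFF,
   gregLO3ofRM(opc[1]) == 0 and a memory E-part, so it is accepted;
   "lock incl %ebx" has a register E-part and is rejected. */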
   10166 
   10167 
   10168 /*------------------------------------------------------------*/
   10169 /*---                                                      ---*/
   10170 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   10171 /*---                                                      ---*/
   10172 /*------------------------------------------------------------*/
   10173 
   10174 static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
   10175                          Long delta, Bool isAvx, UChar opc )
   10176 {
   10177    vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   10178    Int    alen  = 0;
   10179    HChar  dis_buf[50];
   10180    IRTemp argL  = newTemp(Ity_F64);
   10181    IRTemp argR  = newTemp(Ity_F64);
   10182    UChar  modrm = getUChar(delta);
   10183    IRTemp addr  = IRTemp_INVALID;
   10184    if (epartIsReg(modrm)) {
   10185       assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
   10186                                       0/*lowest lane*/ ) );
   10187       delta += 1;
   10188       DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
   10189                                 opc==0x2E ? "u" : "",
   10190                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
   10191                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10192    } else {
   10193       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10194       assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   10195       delta += alen;
   10196       DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
   10197                                 opc==0x2E ? "u" : "",
   10198                                 dis_buf,
   10199                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10200    }
   10201    assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
   10202                                    0/*lowest lane*/ ) );
   10203 
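   /* Iop_CmpF64 yields an IRCmpF64Result: 0x00 GT, 0x01 LT, 0x40 EQ,
      0x45 UN.  And-ing with 0x45 leaves exactly the ZF (0x40), PF
      (0x04) and CF (0x01) positions that [U]COMISD defines; OF, AF
      and SF read as zero via the COPY thunk. */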
   10204    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   10205    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   10206    stmt( IRStmt_Put(
   10207             OFFB_CC_DEP1,
   10208             binop( Iop_And64,
   10209                    unop( Iop_32Uto64,
   10210                          binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
   10211                    mkU64(0x45)
   10212        )));
   10213    return delta;
   10214 }
   10215 
   10216 
   10217 static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
   10218                          Long delta, Bool isAvx, UChar opc )
   10219 {
   10220    vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   10221    Int    alen  = 0;
   10222    HChar  dis_buf[50];
   10223    IRTemp argL  = newTemp(Ity_F32);
   10224    IRTemp argR  = newTemp(Ity_F32);
   10225    UChar  modrm = getUChar(delta);
   10226    IRTemp addr  = IRTemp_INVALID;
   10227    if (epartIsReg(modrm)) {
   10228       assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
   10229                                       0/*lowest lane*/ ) );
   10230       delta += 1;
   10231       DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
   10232                                 opc==0x2E ? "u" : "",
   10233                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
   10234                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10235    } else {
   10236       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10237       assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   10238       delta += alen;
   10239       DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
   10240                                 opc==0x2E ? "u" : "",
   10241                                 dis_buf,
   10242                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10243    }
   10244    assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
   10245                                    0/*lowest lane*/ ) );
   10246 
   10247    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   10248    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   10249    stmt( IRStmt_Put(
   10250             OFFB_CC_DEP1,
   10251             binop( Iop_And64,
   10252                    unop( Iop_32Uto64,
   10253                          binop(Iop_CmpF64,
   10254                                unop(Iop_F32toF64,mkexpr(argL)),
   10255                                unop(Iop_F32toF64,mkexpr(argR)))),
   10256                    mkU64(0x45)
   10257        )));
   10258    return delta;
   10259 }
   10260 
   10261 
   10262 static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
   10263                               Long delta, Bool writesYmm )
   10264 {
   10265    Int    order;
   10266    Int    alen  = 0;
   10267    HChar  dis_buf[50];
   10268    IRTemp sV    = newTemp(Ity_V128);
   10269    UChar  modrm = getUChar(delta);
   10270    const HChar* strV  = writesYmm ? "v" : "";
   10271    IRTemp addr  = IRTemp_INVALID;
   10272    if (epartIsReg(modrm)) {
   10273       assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   10274       order = (Int)getUChar(delta+1);
   10275       delta += 1+1;
   10276       DIP("%spshufd $%d,%s,%s\n", strV, order,
   10277                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10278                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   10279    } else {
   10280       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   10281                         1/*byte after the amode*/ );
   10282       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10283       order = (Int)getUChar(delta+alen);
   10284       delta += alen+1;
   10285       DIP("%spshufd $%d,%s,%s\n", strV, order,
   10286                                  dis_buf,
   10287                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10288    }
   10289 
   10290    IRTemp s3, s2, s1, s0;
   10291    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10292    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10293 
   10294 #  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10295    IRTemp dV = newTemp(Ity_V128);
   10296    assign(dV,
   10297           mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   10298                          SEL((order>>2)&3), SEL((order>>0)&3) )
   10299    );
   10300 #  undef SEL
   10301 
   10302    (writesYmm ? putYMMRegLoAndZU : putXMMReg)
   10303       (gregOfRexRM(pfx,modrm), mkexpr(dV));
   10304    return delta;
   10305 }
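
/* The immediate is four 2-bit source-lane selectors; bits 1:0 choose
   destination lane 0 and so on upward.  E.g. order == 0x1B
   (0b00011011) writes s3,s2,s1,s0 to lanes 0..3, reversing the four
   32-bit lanes, while order == 0x00 splats lane 0. */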
   10306 
   10307 
   10308 static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   10309 {
   10310    Int    order;
   10311    Int    alen  = 0;
   10312    HChar  dis_buf[50];
   10313    IRTemp sV    = newTemp(Ity_V256);
   10314    UChar  modrm = getUChar(delta);
   10315    IRTemp addr  = IRTemp_INVALID;
   10316    UInt   rG    = gregOfRexRM(pfx,modrm);
   10317    if (epartIsReg(modrm)) {
   10318       UInt rE = eregOfRexRM(pfx,modrm);
   10319       assign( sV, getYMMReg(rE) );
   10320       order = (Int)getUChar(delta+1);
   10321       delta += 1+1;
   10322       DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   10323    } else {
   10324       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   10325                         1/*byte after the amode*/ );
   10326       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   10327       order = (Int)getUChar(delta+alen);
   10328       delta += alen+1;
   10329       DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   10330    }
   10331 
   10332    IRTemp s[8];
   10333    s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   10334    breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   10335                          &s[3], &s[2], &s[1], &s[0] );
   10336 
   10337    putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
   10338                                  s[4 + ((order>>4)&3)],
   10339                                  s[4 + ((order>>2)&3)],
   10340                                  s[4 + ((order>>0)&3)],
   10341                                  s[0 + ((order>>6)&3)],
   10342                                  s[0 + ((order>>4)&3)],
   10343                                  s[0 + ((order>>2)&3)],
   10344                                  s[0 + ((order>>0)&3)] ) );
   10345    return delta;
   10346 }
   10347 
   10348 
   10349 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10350 {
   10351    IRTemp dV    = newTemp(Ity_V128);
   10352    IRTemp hi64  = newTemp(Ity_I64);
   10353    IRTemp lo64  = newTemp(Ity_I64);
   10354    IRTemp hi64r = newTemp(Ity_I64);
   10355    IRTemp lo64r = newTemp(Ity_I64);
   10356 
   10357    vassert(imm >= 0 && imm <= 255);
   10358    if (imm >= 16) {
   10359       assign(dV, mkV128(0x0000));
   10360       return dV;
   10361    }
   10362 
   10363    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10364    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10365 
   10366    if (imm == 0) {
   10367       assign( lo64r, mkexpr(lo64) );
   10368       assign( hi64r, mkexpr(hi64) );
   10369    }
   10370    else
   10371    if (imm == 8) {
   10372       assign( hi64r, mkU64(0) );
   10373       assign( lo64r, mkexpr(hi64) );
   10374    }
   10375    else
   10376    if (imm > 8) {
   10377       assign( hi64r, mkU64(0) );
   10378       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10379    } else {
   10380       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10381       assign( lo64r,
   10382               binop( Iop_Or64,
   10383                      binop(Iop_Shr64, mkexpr(lo64),
   10384                            mkU8(8 * imm)),
   10385                      binop(Iop_Shl64, mkexpr(hi64),
   10386                            mkU8(8 * (8 - imm)) )
   10387                      )
   10388               );
   10389    }
   10390 
   10391    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10392    return dV;
   10393 }
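
/* Byte-wise model of the case split above (illustrative only);
   math_PSLLDQ below is the exact mirror image. */
#if 0
static void ref_psrldq ( UChar* v/*16 bytes, little-endian*/, UInt imm )
{
   UInt i;
   for (i = 0; i < 16; i++)
      v[i] = i + imm < 16 ? v[i + imm] : 0;   /* in-place is safe here */
}
#endif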
   10394 
   10395 
   10396 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10397 {
   10398    IRTemp       dV    = newTemp(Ity_V128);
   10399    IRTemp       hi64  = newTemp(Ity_I64);
   10400    IRTemp       lo64  = newTemp(Ity_I64);
   10401    IRTemp       hi64r = newTemp(Ity_I64);
   10402    IRTemp       lo64r = newTemp(Ity_I64);
   10403 
   10404    vassert(imm >= 0 && imm <= 255);
   10405    if (imm >= 16) {
   10406       assign(dV, mkV128(0x0000));
   10407       return dV;
   10408    }
   10409 
   10410    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10411    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10412 
   10413    if (imm == 0) {
   10414       assign( lo64r, mkexpr(lo64) );
   10415       assign( hi64r, mkexpr(hi64) );
   10416    }
   10417    else
   10418    if (imm == 8) {
   10419       assign( lo64r, mkU64(0) );
   10420       assign( hi64r, mkexpr(lo64) );
   10421    }
   10422    else
   10423    if (imm > 8) {
   10424       assign( lo64r, mkU64(0) );
   10425       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10426    } else {
   10427       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10428       assign( hi64r,
   10429               binop( Iop_Or64,
   10430                      binop(Iop_Shl64, mkexpr(hi64),
   10431                            mkU8(8 * imm)),
   10432                      binop(Iop_Shr64, mkexpr(lo64),
   10433                            mkU8(8 * (8 - imm)) )
   10434                      )
   10435               );
   10436    }
   10437 
   10438    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10439    return dV;
   10440 }
   10441 
   10442 
   10443 static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
   10444                             Long delta, Bool isAvx, UChar opc, Int sz )
   10445 {
   10446    vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   10447    HChar  dis_buf[50];
   10448    Int    alen   = 0;
   10449    UChar  modrm  = getUChar(delta);
   10450    IRTemp addr   = IRTemp_INVALID;
   10451    IRTemp rmode  = newTemp(Ity_I32);
   10452    IRTemp f64lo  = newTemp(Ity_F64);
   10453    Bool   r2zero = toBool(opc == 0x2C);
   10454 
   10455    if (epartIsReg(modrm)) {
   10456       delta += 1;
   10457       assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10458       DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10459                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10460                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10461                                            False));
   10462    } else {
   10463       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10464       assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10465       delta += alen;
   10466       DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10467                                   dis_buf,
   10468                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10469                                            False));
   10470    }
   10471 
   10472    if (r2zero) {
   10473       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10474    } else {
   10475       assign( rmode, get_sse_roundingmode() );
   10476    }
   10477 
   10478    if (sz == 4) {
   10479       putIReg32( gregOfRexRM(pfx,modrm),
   10480                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   10481    } else {
   10482       vassert(sz == 8);
   10483       putIReg64( gregOfRexRM(pfx,modrm),
   10484                  binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   10485    }
   10486 
   10487    return delta;
   10488 }
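
/* The "t" (truncating) forms always round towards zero, irrespective
   of MXCSR.RC; the non-"t" forms use the current SSE rounding mode.
   That is the only difference, hence the shared code path above. */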
   10489 
   10490 
   10491 static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
   10492                             Long delta, Bool isAvx, UChar opc, Int sz )
   10493 {
   10494    vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   10495    HChar  dis_buf[50];
   10496    Int    alen   = 0;
   10497    UChar  modrm  = getUChar(delta);
   10498    IRTemp addr   = IRTemp_INVALID;
   10499    IRTemp rmode  = newTemp(Ity_I32);
   10500    IRTemp f32lo  = newTemp(Ity_F32);
   10501    Bool   r2zero = toBool(opc == 0x2C);
   10502 
   10503    if (epartIsReg(modrm)) {
   10504       delta += 1;
   10505       assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   10506       DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10507                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10508                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10509                                            False));
   10510    } else {
   10511       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10512       assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   10513       delta += alen;
   10514       DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10515                                   dis_buf,
   10516                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10517                                            False));
   10518    }
   10519 
   10520    if (r2zero) {
   10521       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10522    } else {
   10523       assign( rmode, get_sse_roundingmode() );
   10524    }
   10525 
   10526    if (sz == 4) {
   10527       putIReg32( gregOfRexRM(pfx,modrm),
   10528                  binop( Iop_F64toI32S,
   10529                         mkexpr(rmode),
   10530                         unop(Iop_F32toF64, mkexpr(f32lo))) );
   10531    } else {
   10532       vassert(sz == 8);
   10533       putIReg64( gregOfRexRM(pfx,modrm),
   10534                  binop( Iop_F64toI64S,
   10535                         mkexpr(rmode),
   10536                         unop(Iop_F32toF64, mkexpr(f32lo))) );
   10537    }
   10538 
   10539    return delta;
   10540 }
   10541 
   10542 
   10543 static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10544                                Long delta, Bool isAvx )
   10545 {
   10546    IRTemp addr  = IRTemp_INVALID;
   10547    Int    alen  = 0;
   10548    HChar  dis_buf[50];
   10549    IRTemp f32lo = newTemp(Ity_F32);
   10550    IRTemp f32hi = newTemp(Ity_F32);
   10551    UChar  modrm = getUChar(delta);
   10552    UInt   rG    = gregOfRexRM(pfx,modrm);
   10553    if (epartIsReg(modrm)) {
   10554       UInt rE = eregOfRexRM(pfx,modrm);
   10555       assign( f32lo, getXMMRegLane32F(rE, 0) );
   10556       assign( f32hi, getXMMRegLane32F(rE, 1) );
   10557       delta += 1;
   10558       DIP("%scvtps2pd %s,%s\n",
   10559           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   10560    } else {
   10561       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10562       assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   10563       assign( f32hi, loadLE(Ity_F32,
   10564                             binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
   10565       delta += alen;
   10566       DIP("%scvtps2pd %s,%s\n",
   10567           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   10568    }
   10569 
   10570    putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   10571    putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   10572    if (isAvx)
   10573       putYMMRegLane128( rG, 1, mkV128(0));
   10574    return delta;
   10575 }
   10576 
   10577 
   10578 static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10579                                Long delta )
   10580 {
   10581    IRTemp addr  = IRTemp_INVALID;
   10582    Int    alen  = 0;
   10583    HChar  dis_buf[50];
   10584    IRTemp f32_0 = newTemp(Ity_F32);
   10585    IRTemp f32_1 = newTemp(Ity_F32);
   10586    IRTemp f32_2 = newTemp(Ity_F32);
   10587    IRTemp f32_3 = newTemp(Ity_F32);
   10588    UChar  modrm = getUChar(delta);
   10589    UInt   rG    = gregOfRexRM(pfx,modrm);
   10590    if (epartIsReg(modrm)) {
   10591       UInt rE = eregOfRexRM(pfx,modrm);
   10592       assign( f32_0, getXMMRegLane32F(rE, 0) );
   10593       assign( f32_1, getXMMRegLane32F(rE, 1) );
   10594       assign( f32_2, getXMMRegLane32F(rE, 2) );
   10595       assign( f32_3, getXMMRegLane32F(rE, 3) );
   10596       delta += 1;
   10597       DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   10598    } else {
   10599       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10600       assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
   10601       assign( f32_1, loadLE(Ity_F32,
   10602                             binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
   10603       assign( f32_2, loadLE(Ity_F32,
   10604                             binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
   10605       assign( f32_3, loadLE(Ity_F32,
   10606                             binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
   10607       delta += alen;
   10608       DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   10609    }
   10610 
   10611    putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   10612    putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   10613    putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   10614    putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   10615    return delta;
   10616 }
   10617 
   10618 
   10619 static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10620                                Long delta, Bool isAvx )
   10621 {
   10622    IRTemp addr  = IRTemp_INVALID;
   10623    Int    alen  = 0;
   10624    HChar  dis_buf[50];
   10625    UChar  modrm = getUChar(delta);
   10626    UInt   rG    = gregOfRexRM(pfx,modrm);
   10627    IRTemp argV  = newTemp(Ity_V128);
   10628    IRTemp rmode = newTemp(Ity_I32);
   10629    if (epartIsReg(modrm)) {
   10630       UInt rE = eregOfRexRM(pfx,modrm);
   10631       assign( argV, getXMMReg(rE) );
   10632       delta += 1;
   10633       DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
   10634           nameXMMReg(rE), nameXMMReg(rG));
   10635    } else {
   10636       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10637       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10638       delta += alen;
   10639       DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
   10640           dis_buf, nameXMMReg(rG) );
   10641    }
   10642 
   10643    assign( rmode, get_sse_roundingmode() );
   10644    IRTemp t0 = newTemp(Ity_F64);
   10645    IRTemp t1 = newTemp(Ity_F64);
   10646    assign( t0, unop(Iop_ReinterpI64asF64,
   10647                     unop(Iop_V128to64, mkexpr(argV))) );
   10648    assign( t1, unop(Iop_ReinterpI64asF64,
   10649                     unop(Iop_V128HIto64, mkexpr(argV))) );
   10650 
   10651 #  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   10652    putXMMRegLane32(  rG, 3, mkU32(0) );
   10653    putXMMRegLane32(  rG, 2, mkU32(0) );
   10654    putXMMRegLane32F( rG, 1, CVT(t1) );
   10655    putXMMRegLane32F( rG, 0, CVT(t0) );
   10656 #  undef CVT
   10657    if (isAvx)
   10658       putYMMRegLane128( rG, 1, mkV128(0) );
   10659 
   10660    return delta;
   10661 }
   10662 
   10663 
   10664 static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10665                                 Long delta, Bool isAvx, Bool r2zero )
   10666 {
   10667    IRTemp addr  = IRTemp_INVALID;
   10668    Int    alen  = 0;
   10669    HChar  dis_buf[50];
   10670    UChar  modrm = getUChar(delta);
   10671    IRTemp argV  = newTemp(Ity_V128);
   10672    IRTemp rmode = newTemp(Ity_I32);
   10673    UInt   rG    = gregOfRexRM(pfx,modrm);
   10674    IRTemp t0, t1, t2, t3;
   10675 
   10676    if (epartIsReg(modrm)) {
   10677       UInt rE = eregOfRexRM(pfx,modrm);
   10678       assign( argV, getXMMReg(rE) );
   10679       delta += 1;
   10680       DIP("%scvt%sps2dq %s,%s\n",
   10681           isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   10682    } else {
   10683       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10684       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10685       delta += alen;
   10686       DIP("%scvt%sps2dq %s,%s\n",
   10687           isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10688    }
   10689 
   10690    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
   10691                          : get_sse_roundingmode() );
   10692    t0 = t1 = t2 = t3 = IRTemp_INVALID;
   10693    breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   10694    /* This is less than ideal.  If it turns out to be a performance
   10695       bottleneck it can be improved. */
   10696 #  define CVT(_t)                             \
   10697       binop( Iop_F64toI32S,                   \
   10698              mkexpr(rmode),                   \
   10699              unop( Iop_F32toF64,              \
   10700                    unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10701 
   10702    putXMMRegLane32( rG, 3, CVT(t3) );
   10703    putXMMRegLane32( rG, 2, CVT(t2) );
   10704    putXMMRegLane32( rG, 1, CVT(t1) );
   10705    putXMMRegLane32( rG, 0, CVT(t0) );
   10706 #  undef CVT
   10707    if (isAvx)
   10708       putYMMRegLane128( rG, 1, mkV128(0) );
   10709 
   10710    return delta;
   10711 }
   10712 
   10713 
   10714 static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10715                                 Long delta, Bool r2zero )
   10716 {
   10717    IRTemp addr  = IRTemp_INVALID;
   10718    Int    alen  = 0;
   10719    HChar  dis_buf[50];
   10720    UChar  modrm = getUChar(delta);
   10721    IRTemp argV  = newTemp(Ity_V256);
   10722    IRTemp rmode = newTemp(Ity_I32);
   10723    UInt   rG    = gregOfRexRM(pfx,modrm);
   10724    IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
   10725 
   10726    if (epartIsReg(modrm)) {
   10727       UInt rE = eregOfRexRM(pfx,modrm);
   10728       assign( argV, getYMMReg(rE) );
   10729       delta += 1;
   10730       DIP("vcvt%sps2dq %s,%s\n",
   10731           r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   10732    } else {
   10733       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10734       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10735       delta += alen;
   10736       DIP("vcvt%sps2dq %s,%s\n",
   10737           r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   10738    }
   10739 
   10740    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
   10741                          : get_sse_roundingmode() );
   10742    t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   10743    breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   10744    /* This is less than ideal.  If it turns out to be a performance
   10745       bottleneck it can be improved. */
   10746 #  define CVT(_t)                             \
   10747       binop( Iop_F64toI32S,                   \
   10748              mkexpr(rmode),                   \
   10749              unop( Iop_F32toF64,              \
   10750                    unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10751 
   10752    putYMMRegLane32( rG, 7, CVT(t7) );
   10753    putYMMRegLane32( rG, 6, CVT(t6) );
   10754    putYMMRegLane32( rG, 5, CVT(t5) );
   10755    putYMMRegLane32( rG, 4, CVT(t4) );
   10756    putYMMRegLane32( rG, 3, CVT(t3) );
   10757    putYMMRegLane32( rG, 2, CVT(t2) );
   10758    putYMMRegLane32( rG, 1, CVT(t1) );
   10759    putYMMRegLane32( rG, 0, CVT(t0) );
   10760 #  undef CVT
   10761 
   10762    return delta;
   10763 }
   10764 
   10765 
   10766 static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10767                                 Long delta, Bool isAvx, Bool r2zero )
   10768 {
   10769    IRTemp addr  = IRTemp_INVALID;
   10770    Int    alen  = 0;
   10771    HChar  dis_buf[50];
   10772    UChar  modrm = getUChar(delta);
   10773    IRTemp argV  = newTemp(Ity_V128);
   10774    IRTemp rmode = newTemp(Ity_I32);
   10775    UInt   rG    = gregOfRexRM(pfx,modrm);
   10776    IRTemp t0, t1;
   10777 
   10778    if (epartIsReg(modrm)) {
   10779       UInt rE = eregOfRexRM(pfx,modrm);
   10780       assign( argV, getXMMReg(rE) );
   10781       delta += 1;
   10782       DIP("%scvt%spd2dq %s,%s\n",
   10783           isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   10784    } else {
   10785       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10786       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10787       delta += alen;
   10788       DIP("%scvt%spd2dqx %s,%s\n",
   10789           isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10790    }
   10791 
   10792    if (r2zero) {
   10793       assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10794    } else {
   10795       assign( rmode, get_sse_roundingmode() );
   10796    }
   10797 
   10798    t0 = newTemp(Ity_F64);
   10799    t1 = newTemp(Ity_F64);
   10800    assign( t0, unop(Iop_ReinterpI64asF64,
   10801                     unop(Iop_V128to64, mkexpr(argV))) );
   10802    assign( t1, unop(Iop_ReinterpI64asF64,
   10803                     unop(Iop_V128HIto64, mkexpr(argV))) );
   10804 
   10805 #  define CVT(_t)  binop( Iop_F64toI32S,                   \
   10806                           mkexpr(rmode),                   \
   10807                           mkexpr(_t) )
   10808 
   10809    putXMMRegLane32( rG, 3, mkU32(0) );
   10810    putXMMRegLane32( rG, 2, mkU32(0) );
   10811    putXMMRegLane32( rG, 1, CVT(t1) );
   10812    putXMMRegLane32( rG, 0, CVT(t0) );
   10813 #  undef CVT
   10814    if (isAvx)
   10815       putYMMRegLane128( rG, 1, mkV128(0) );
   10816 
   10817    return delta;
   10818 }
   10819 
   10820 
   10821 static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10822                                 Long delta, Bool r2zero )
   10823 {
   10824    IRTemp addr  = IRTemp_INVALID;
   10825    Int    alen  = 0;
   10826    HChar  dis_buf[50];
   10827    UChar  modrm = getUChar(delta);
   10828    IRTemp argV  = newTemp(Ity_V256);
   10829    IRTemp rmode = newTemp(Ity_I32);
   10830    UInt   rG    = gregOfRexRM(pfx,modrm);
   10831    IRTemp t0, t1, t2, t3;
   10832 
   10833    if (epartIsReg(modrm)) {
   10834       UInt rE = eregOfRexRM(pfx,modrm);
   10835       assign( argV, getYMMReg(rE) );
   10836       delta += 1;
   10837       DIP("vcvt%spd2dq %s,%s\n",
   10838           r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   10839    } else {
   10840       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10841       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10842       delta += alen;
   10843       DIP("vcvt%spd2dqy %s,%s\n",
   10844           r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10845    }
   10846 
   10847    if (r2zero) {
   10848       assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10849    } else {
   10850       assign( rmode, get_sse_roundingmode() );
   10851    }
   10852 
   10853    t0 = IRTemp_INVALID;
   10854    t1 = IRTemp_INVALID;
   10855    t2 = IRTemp_INVALID;
   10856    t3 = IRTemp_INVALID;
   10857    breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   10858 
   10859 #  define CVT(_t)  binop( Iop_F64toI32S,                   \
   10860                           mkexpr(rmode),                   \
   10861                           unop( Iop_ReinterpI64asF64,      \
   10862                                 mkexpr(_t) ) )
   10863 
   10864    putXMMRegLane32( rG, 3, CVT(t3) );
   10865    putXMMRegLane32( rG, 2, CVT(t2) );
   10866    putXMMRegLane32( rG, 1, CVT(t1) );
   10867    putXMMRegLane32( rG, 0, CVT(t0) );
   10868 #  undef CVT
   10869    putYMMRegLane128( rG, 1, mkV128(0) );
   10870 
   10871    return delta;
   10872 }
   10873 
   10874 
   10875 static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10876                                Long delta, Bool isAvx )
   10877 {
   10878    IRTemp addr  = IRTemp_INVALID;
   10879    Int    alen  = 0;
   10880    HChar  dis_buf[50];
   10881    UChar  modrm = getUChar(delta);
   10882    IRTemp argV  = newTemp(Ity_V128);
   10883    IRTemp rmode = newTemp(Ity_I32);
   10884    UInt   rG    = gregOfRexRM(pfx,modrm);
   10885    IRTemp t0, t1, t2, t3;
   10886 
   10887    if (epartIsReg(modrm)) {
   10888       UInt rE = eregOfRexRM(pfx,modrm);
   10889       assign( argV, getXMMReg(rE) );
   10890       delta += 1;
   10891       DIP("%scvtdq2ps %s,%s\n",
   10892           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   10893    } else {
   10894       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10895       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10896       delta += alen;
   10897       DIP("%scvtdq2ps %s,%s\n",
   10898           isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   10899    }
   10900 
   10901    assign( rmode, get_sse_roundingmode() );
   10902    t0 = IRTemp_INVALID;
   10903    t1 = IRTemp_INVALID;
   10904    t2 = IRTemp_INVALID;
   10905    t3 = IRTemp_INVALID;
   10906    breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   10907 
   10908 #  define CVT(_t)  binop( Iop_F64toF32,                    \
   10909                           mkexpr(rmode),                   \
   10910                           unop(Iop_I32StoF64,mkexpr(_t)))
   10911 
   10912    putXMMRegLane32F( rG, 3, CVT(t3) );
   10913    putXMMRegLane32F( rG, 2, CVT(t2) );
   10914    putXMMRegLane32F( rG, 1, CVT(t1) );
   10915    putXMMRegLane32F( rG, 0, CVT(t0) );
   10916 #  undef CVT
   10917    if (isAvx)
   10918       putYMMRegLane128( rG, 1, mkV128(0) );
   10919 
   10920    return delta;
   10921 }
   10922 
   10923 static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10924                                Long delta )
   10925 {
   10926    IRTemp addr   = IRTemp_INVALID;
   10927    Int    alen   = 0;
   10928    HChar  dis_buf[50];
   10929    UChar  modrm  = getUChar(delta);
   10930    IRTemp argV   = newTemp(Ity_V256);
   10931    IRTemp rmode  = newTemp(Ity_I32);
   10932    UInt   rG     = gregOfRexRM(pfx,modrm);
   10933    IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
   10934 
   10935    if (epartIsReg(modrm)) {
   10936       UInt rE = eregOfRexRM(pfx,modrm);
   10937       assign( argV, getYMMReg(rE) );
   10938       delta += 1;
   10939       DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   10940    } else {
   10941       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10942       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10943       delta += alen;
   10944       DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   10945    }
   10946 
   10947    assign( rmode, get_sse_roundingmode() );
   10948    t0 = IRTemp_INVALID;
   10949    t1 = IRTemp_INVALID;
   10950    t2 = IRTemp_INVALID;
   10951    t3 = IRTemp_INVALID;
   10952    t4 = IRTemp_INVALID;
   10953    t5 = IRTemp_INVALID;
   10954    t6 = IRTemp_INVALID;
   10955    t7 = IRTemp_INVALID;
   10956    breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   10957 
   10958 #  define CVT(_t)  binop( Iop_F64toF32,                    \
   10959                           mkexpr(rmode),                   \
   10960                           unop(Iop_I32StoF64,mkexpr(_t)))
   10961 
   10962    putYMMRegLane32F( rG, 7, CVT(t7) );
   10963    putYMMRegLane32F( rG, 6, CVT(t6) );
   10964    putYMMRegLane32F( rG, 5, CVT(t5) );
   10965    putYMMRegLane32F( rG, 4, CVT(t4) );
   10966    putYMMRegLane32F( rG, 3, CVT(t3) );
   10967    putYMMRegLane32F( rG, 2, CVT(t2) );
   10968    putYMMRegLane32F( rG, 1, CVT(t1) );
   10969    putYMMRegLane32F( rG, 0, CVT(t0) );
   10970 #  undef CVT
   10971 
   10972    return delta;
   10973 }
   10974 
   10975 
   10976 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10977                                Long delta, Bool isAvx )
   10978 {
   10979    UChar modrm = getUChar(delta);
   10980    vassert(epartIsReg(modrm)); /* ensured by caller */
   10981    UInt   rE = eregOfRexRM(pfx,modrm);
   10982    UInt   rG = gregOfRexRM(pfx,modrm);
   10983    IRTemp t0 = newTemp(Ity_V128);
   10984    IRTemp t1 = newTemp(Ity_I32);
   10985    assign(t0, getXMMReg(rE));
   10986    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10987    putIReg32(rG, mkexpr(t1));
   10988    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10989        nameIReg32(rG));
   10990    delta += 1;
   10991    return delta;
   10992 }
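
         /* Iop_GetMSBs8x16 gathers the sign bit of each of the 16 bytes
            into a 16-bit value, which 16Uto32 then zero-extends -- exactly
            PMOVMSKB's semantics.  The 256-bit variant below does this per
            128-bit half and glues the halves with Iop_16HLto32. */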
   10993 
   10994 
   10995 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10996                                Long delta  )
   10997 {
   10998    UChar modrm = getUChar(delta);
   10999    vassert(epartIsReg(modrm)); /* ensured by caller */
   11000    UInt   rE = eregOfRexRM(pfx,modrm);
   11001    UInt   rG = gregOfRexRM(pfx,modrm);
   11002    IRTemp t0 = newTemp(Ity_V128);
   11003    IRTemp t1 = newTemp(Ity_V128);
   11004    IRTemp t2 = newTemp(Ity_I16);
   11005    IRTemp t3 = newTemp(Ity_I16);
   11006    assign(t0, getYMMRegLane128(rE, 0));
   11007    assign(t1, getYMMRegLane128(rE, 1));
   11008    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   11009    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   11010    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   11011    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11012    delta += 1;
   11013    return delta;
   11014 }
   11015 
   11016 
   11017 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   11018    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   11019 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   11020 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11021 {
   11022    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11023    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11024    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11025    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11026    IRTemp res = newTemp(Ity_V128);
   11027    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   11028                      : mkV128from32s( s1, d1, s0, d0 ));
   11029    return res;
   11030 }
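
         /* Per the FIXME above, an untested sketch of the interleave-based
            alternative.  Operand order here assumes, as the PUNPCK*
            handling does, that the E (source) vector goes on the left:

               assign(res, binop(xIsH ? Iop_InterleaveHI32x4
                                      : Iop_InterleaveLO32x4,
                                 mkexpr(sV), mkexpr(dV)));
         */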
   11031 
   11032 
   11033 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   11034 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   11035 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11036 {
   11037    IRTemp s1 = newTemp(Ity_I64);
   11038    IRTemp s0 = newTemp(Ity_I64);
   11039    IRTemp d1 = newTemp(Ity_I64);
   11040    IRTemp d0 = newTemp(Ity_I64);
   11041    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11042    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11043    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11044    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11045    IRTemp res = newTemp(Ity_V128);
   11046    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   11047                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   11048    return res;
   11049 }
   11050 
   11051 
   11052 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   11053    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   11054    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   11055    way. */
   11056 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11057 {
   11058    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11059    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11060    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   11061    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   11062    IRTemp res = newTemp(Ity_V256);
   11063    assign(res, xIsH
   11064                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   11065                                             mkexpr(s1), mkexpr(d1))
   11066                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   11067                                             mkexpr(s0), mkexpr(d0)));
   11068    return res;
   11069 }
   11070 
   11071 
   11072 /* FIXME: this is really bad.  Surely can do something better here?
   11073    One observation is that the steering in the upper and lower 128 bit
   11074    halves is the same as with math_UNPCKxPS_128, so we simply split
   11075    into two halves, and use that.  Consequently any improvement in
   11076    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   11077    benefits this too. */
   11078 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11079 {
   11080    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11081    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11082    breakupV256toV128s( sV, &sVhi, &sVlo );
   11083    breakupV256toV128s( dV, &dVhi, &dVlo );
   11084    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   11085    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   11086    IRTemp rV   = newTemp(Ity_V256);
   11087    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11088    return rV;
   11089 }
   11090 
   11091 
   11092 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11093 {
   11094    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11095    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11096    vassert(imm8 < 256);
   11097 
   11098    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11099    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11100 
   11101 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   11102 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11103    IRTemp res = newTemp(Ity_V128);
   11104    assign(res,
   11105           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   11106                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   11107 #  undef SELD
   11108 #  undef SELS
   11109    return res;
   11110 }
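
         /* Worked example: SHUFPS with imm8 = 0x1B (binary 00 01 10 11)
            selects, from high lane to low, s0, s1, d2, d3 -- each half of
            the result reverses its source's lane order.  Note the low two
            result lanes always come from dV and the high two from sV. */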
   11111 
   11112 
   11113 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   11114    identically.  Hence do the clueless thing and use math_SHUFPS_128
   11115    twice. */
   11116 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11117 {
   11118    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11119    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11120    breakupV256toV128s( sV, &sVhi, &sVlo );
   11121    breakupV256toV128s( dV, &dVhi, &dVlo );
   11122    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   11123    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   11124    IRTemp rV   = newTemp(Ity_V256);
   11125    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11126    return rV;
   11127 }
   11128 
   11129 
   11130 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11131 {
   11132    IRTemp s1 = newTemp(Ity_I64);
   11133    IRTemp s0 = newTemp(Ity_I64);
   11134    IRTemp d1 = newTemp(Ity_I64);
   11135    IRTemp d0 = newTemp(Ity_I64);
   11136 
   11137    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11138    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11139    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11140    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11141 
   11142 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   11143 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   11144 
   11145    IRTemp res = newTemp(Ity_V128);
   11146    assign(res, binop( Iop_64HLtoV128,
   11147                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   11148 
   11149 #  undef SELD
   11150 #  undef SELS
   11151    return res;
   11152 }
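
         /* Example: imm8 bit 0 picks the low result lane from dV and bit 1
            picks the high result lane from sV, so imm8 = 1 yields
            64HLtoV128(s0, d1). */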
   11153 
   11154 
   11155 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11156 {
   11157    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11158    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11159    breakupV256toV128s( sV, &sVhi, &sVlo );
   11160    breakupV256toV128s( dV, &dVhi, &dVlo );
   11161    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11162    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   11163    IRTemp rV   = newTemp(Ity_V256);
   11164    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11165    return rV;
   11166 }
   11167 
   11168 
   11169 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11170 {
   11171    UShort imm8_mask_16;
   11172    IRTemp imm8_mask = newTemp(Ity_V128);
   11173 
   11174    switch( imm8 & 3 ) {
   11175       case 0:  imm8_mask_16 = 0x0000; break;
   11176       case 1:  imm8_mask_16 = 0x00FF; break;
   11177       case 2:  imm8_mask_16 = 0xFF00; break;
   11178       case 3:  imm8_mask_16 = 0xFFFF; break;
   11179       default: vassert(0);            break;
   11180    }
   11181    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   11182 
   11183    IRTemp res = newTemp(Ity_V128);
   11184    assign ( res, binop( Iop_OrV128,
   11185                         binop( Iop_AndV128, mkexpr(sV),
   11186                                             mkexpr(imm8_mask) ),
   11187                         binop( Iop_AndV128, mkexpr(dV),
   11188                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11189    return res;
   11190 }
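
         /* The blend identity used here (and in the BLENDPS/PBLENDW
            variants below) is res = (sV & mask) | (dV & ~mask): imm8 bit i
            set means 64-bit lane i comes from the source, clear means it
            comes from the destination.  E.g. imm8 = 2 gives mask 0xFF00:
            high lane from sV, low lane from dV. */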
   11191 
   11192 
   11193 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11194 {
   11195    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11196    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11197    breakupV256toV128s( sV, &sVhi, &sVlo );
   11198    breakupV256toV128s( dV, &dVhi, &dVlo );
   11199    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11200    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   11201    IRTemp rV   = newTemp(Ity_V256);
   11202    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11203    return rV;
   11204 }
   11205 
   11206 
   11207 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11208 {
   11209    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   11210                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   11211                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   11212                              0xFFFF };
   11213    IRTemp imm8_mask = newTemp(Ity_V128);
   11214    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   11215 
   11216    IRTemp res = newTemp(Ity_V128);
   11217    assign ( res, binop( Iop_OrV128,
   11218                         binop( Iop_AndV128, mkexpr(sV),
   11219                                             mkexpr(imm8_mask) ),
   11220                         binop( Iop_AndV128, mkexpr(dV),
   11221                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11222    return res;
   11223 }
   11224 
   11225 
   11226 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11227 {
   11228    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11229    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11230    breakupV256toV128s( sV, &sVhi, &sVlo );
   11231    breakupV256toV128s( dV, &dVhi, &dVlo );
   11232    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11233    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11234    IRTemp rV   = newTemp(Ity_V256);
   11235    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11236    return rV;
   11237 }
   11238 
   11239 
   11240 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11241 {
   11242    /* Make imm16 be a 16-bit mask formed by duplicating each bit of
   11243       imm8, so that imm8 bit i controls 16-bit lane i. */
   11244    Int i;
   11245    UShort imm16 = 0;
   11246    for (i = 0; i < 8; i++) {
   11247       if (imm8 & (1 << i))
   11248          imm16 |= (3 << (2*i));
   11249    }
   11250    IRTemp imm16_mask = newTemp(Ity_V128);
   11251    assign( imm16_mask, mkV128( imm16 ));
   11252 
   11253    IRTemp res = newTemp(Ity_V128);
   11254    assign ( res, binop( Iop_OrV128,
   11255                         binop( Iop_AndV128, mkexpr(sV),
   11256                                             mkexpr(imm16_mask) ),
   11257                         binop( Iop_AndV128, mkexpr(dV),
   11258                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11259    return res;
   11260 }
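
         /* The bit-doubling above is needed because mkV128's 16-bit
            immediate carries one bit per byte lane, whereas PBLENDW's imm8
            carries one bit per 16-bit lane; duplicating each imm8 bit makes
            both bytes of a word lane agree. */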
   11261 
   11262 
   11263 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   11264 {
   11265    /* This is a really poor translation -- could be improved if
   11266       performance critical */
   11267    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11268    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11269    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11270    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11271    IRTemp res = newTemp(Ity_V128);
   11272    assign(res, binop(Iop_64HLtoV128,
   11273                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   11274                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   11275    return res;
   11276 }
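
         /* PMULUDQ reads only the even-numbered 32-bit lanes (0 and 2),
            zero-extends them and forms two full 64-bit products, so
            d1/d3/s1/s3 are deliberately ignored above.  Iop_MullU32 is a
            widening 32x32->64 multiply, hence no product bits are lost. */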
   11277 
   11278 
   11279 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11280 {
   11281    /* This is a really poor translation -- could be improved if
   11282       performance critical */
   11283    IRTemp sHi, sLo, dHi, dLo;
   11284    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11285    breakupV256toV128s( dV, &dHi, &dLo);
   11286    breakupV256toV128s( sV, &sHi, &sLo);
   11287    IRTemp res = newTemp(Ity_V256);
   11288    assign(res, binop(Iop_V128HLtoV256,
   11289                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11290                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11291    return res;
   11292 }
   11293 
   11294 
   11295 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   11296 {
   11297    /* This is a really poor translation -- could be improved if
   11298       performance critical */
   11299    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11300    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11301    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11302    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11303    IRTemp res = newTemp(Ity_V128);
   11304    assign(res, binop(Iop_64HLtoV128,
   11305                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   11306                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   11307    return res;
   11308 }
   11309 
   11310 
   11311 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11312 {
   11313    /* This is a really poor translation -- could be improved if
   11314       performance critical */
   11315    IRTemp sHi, sLo, dHi, dLo;
   11316    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11317    breakupV256toV128s( dV, &dHi, &dLo);
   11318    breakupV256toV128s( sV, &sHi, &sLo);
   11319    IRTemp res = newTemp(Ity_V256);
   11320    assign(res, binop(Iop_V128HLtoV256,
   11321                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11322                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11323    return res;
   11324 }
   11325 
   11326 
   11327 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
   11328 {
   11329    IRTemp sVhi, sVlo, dVhi, dVlo;
   11330    IRTemp resHi = newTemp(Ity_I64);
   11331    IRTemp resLo = newTemp(Ity_I64);
   11332    sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   11333    breakupV128to64s( sV, &sVhi, &sVlo );
   11334    breakupV128to64s( dV, &dVhi, &dVlo );
   11335    assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11336                                 "amd64g_calculate_mmx_pmaddwd",
   11337                                 &amd64g_calculate_mmx_pmaddwd,
   11338                                 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   11339    assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11340                                 "amd64g_calculate_mmx_pmaddwd",
   11341                                 &amd64g_calculate_mmx_pmaddwd,
   11342                                 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   11343    IRTemp res = newTemp(Ity_V128);
   11344    assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)) );
   11345    return res;
   11346 }
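
         /* PMADDWD computes, per 32-bit result lane i, the signed sum
            d[2i]*s[2i] + d[2i+1]*s[2i+1] of adjacent 16-bit products; the
            arithmetic is delegated to the amd64g_calculate_mmx_pmaddwd
            clean helper, one 64-bit half at a time. */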
   11347 
   11348 
   11349 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11350 {
   11351    IRTemp sHi, sLo, dHi, dLo;
   11352    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11353    breakupV256toV128s( dV, &dHi, &dLo);
   11354    breakupV256toV128s( sV, &sHi, &sLo);
   11355    IRTemp res = newTemp(Ity_V256);
   11356    assign(res, binop(Iop_V128HLtoV256,
   11357                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11358                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11359    return res;
   11360 }
   11361 
   11362 
   11363 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   11364 {
   11365    IRTemp addV = newTemp(Ity_V128);
   11366    IRTemp subV = newTemp(Ity_V128);
   11367    IRTemp a1   = newTemp(Ity_I64);
   11368    IRTemp s0   = newTemp(Ity_I64);
   11369    IRTemp rm   = newTemp(Ity_I32);
   11370 
   11371    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11372    assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11373    assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11374 
   11375    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11376    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11377 
   11378    IRTemp res = newTemp(Ity_V128);
   11379    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11380    return res;
   11381 }
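
         /* ADDSUBPD: result lane 0 = d0 - s0 and lane 1 = d1 + s1.
            Computing full add and sub vectors and then taking the high half
            of addV and the low half of subV, as above, expresses exactly
            that. */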
   11382 
   11383 
   11384 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   11385 {
   11386    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11387    IRTemp addV = newTemp(Ity_V256);
   11388    IRTemp subV = newTemp(Ity_V256);
   11389    IRTemp rm   = newTemp(Ity_I32);
   11390    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11391 
   11392    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11393    assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11394    assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11395 
   11396    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   11397    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   11398 
   11399    IRTemp res = newTemp(Ity_V256);
   11400    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   11401    return res;
   11402 }
   11403 
   11404 
   11405 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   11406 {
   11407    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11408    IRTemp addV = newTemp(Ity_V128);
   11409    IRTemp subV = newTemp(Ity_V128);
   11410    IRTemp rm   = newTemp(Ity_I32);
   11411    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11412 
   11413    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11414    assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11415    assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11416 
   11417    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   11418    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   11419 
   11420    IRTemp res = newTemp(Ity_V128);
   11421    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   11422    return res;
   11423 }
   11424 
   11425 
   11426 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   11427 {
   11428    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   11429    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   11430    IRTemp addV = newTemp(Ity_V256);
   11431    IRTemp subV = newTemp(Ity_V256);
   11432    IRTemp rm   = newTemp(Ity_I32);
   11433    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   11434    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11435 
   11436    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11437    assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11438    assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11439 
   11440    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   11441    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   11442 
   11443    IRTemp res = newTemp(Ity_V256);
   11444    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   11445    return res;
   11446 }
   11447 
   11448 
   11449 /* Handle 128 bit PSHUFLW and PSHUFHW. */
   11450 static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11451                               Long delta, Bool isAvx, Bool xIsH )
   11452 {
   11453    IRTemp addr  = IRTemp_INVALID;
   11454    Int    alen  = 0;
   11455    HChar  dis_buf[50];
   11456    UChar  modrm = getUChar(delta);
   11457    UInt   rG = gregOfRexRM(pfx,modrm);
   11458    UInt   imm8;
   11459    IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   11460    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11461    sV    = newTemp(Ity_V128);
   11462    dV    = newTemp(Ity_V128);
   11463    sVmut = newTemp(Ity_I64);
   11464    dVmut = newTemp(Ity_I64);
   11465    sVcon = newTemp(Ity_I64);
   11466    if (epartIsReg(modrm)) {
   11467       UInt rE = eregOfRexRM(pfx,modrm);
   11468       assign( sV, getXMMReg(rE) );
   11469       imm8 = (UInt)getUChar(delta+1);
   11470       delta += 1+1;
   11471       DIP("%spshuf%cw $%u,%s,%s\n",
   11472           isAvx ? "v" : "", xIsH ? 'h' : 'l',
   11473           imm8, nameXMMReg(rE), nameXMMReg(rG));
   11474    } else {
   11475       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   11476       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11477       imm8 = (UInt)getUChar(delta+alen);
   11478       delta += alen+1;
   11479       DIP("%spshuf%cw $%u,%s,%s\n",
   11480           isAvx ? "v" : "", xIsH ? 'h' : 'l',
   11481           imm8, dis_buf, nameXMMReg(rG));
   11482    }
   11483 
   11484    /* Get the to-be-changed (mut) and unchanging (con) bits of the
   11485       source. */
   11486    assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   11487    assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );
   11488 
   11489    breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
   11490 #  define SEL(n) \
   11491              ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11492    assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
   11493                               SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
   11494 #  undef SEL
   11495 
   11496    assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
   11497                    : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
   11498 
   11499    (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   11500    return delta;
   11501 }
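
         /* So PSHUFLW permutes the low four 16-bit lanes under imm8 control
            and passes the high qword through unchanged, and PSHUFHW is the
            mirror image -- hence the mut(able)/con(stant) split above. */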
   11502 
   11503 
   11504 /* Handle 256 bit PSHUFLW and PSHUFHW. */
   11505 static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
   11506                               Long delta, Bool xIsH )
   11507 {
   11508    IRTemp addr  = IRTemp_INVALID;
   11509    Int    alen  = 0;
   11510    HChar  dis_buf[50];
   11511    UChar  modrm = getUChar(delta);
   11512    UInt   rG = gregOfRexRM(pfx,modrm);
   11513    UInt   imm8;
   11514    IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   11515    sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   11516    s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   11517    sV    = newTemp(Ity_V256);
   11518    dVhi  = newTemp(Ity_I64);
   11519    dVlo  = newTemp(Ity_I64);
   11520    if (epartIsReg(modrm)) {
   11521       UInt rE = eregOfRexRM(pfx,modrm);
   11522       assign( sV, getYMMReg(rE) );
   11523       imm8 = (UInt)getUChar(delta+1);
   11524       delta += 1+1;
   11525       DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
   11526           imm8, nameYMMReg(rE), nameYMMReg(rG));
   11527    } else {
   11528       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   11529       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   11530       imm8 = (UInt)getUChar(delta+alen);
   11531       delta += alen+1;
   11532       DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
   11533           imm8, dis_buf, nameYMMReg(rG));
   11534    }
   11535 
   11536    breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   11537    breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   11538    breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );
   11539 
   11540    assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
   11541                               s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   11542    assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
   11543                               s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   11544    putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
   11545                                  xIsH ? sV64[2] : dVhi,
   11546                                  xIsH ? dVlo : sV64[1],
   11547                                  xIsH ? sV64[0] : dVlo ) );
   11548    return delta;
   11549 }
   11550 
   11551 
   11552 static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
   11553                                           Long delta, Bool isAvx )
   11554 {
   11555    Long   deltaIN = delta;
   11556    UChar  modrm   = getUChar(delta);
   11557    UInt   rG      = gregOfRexRM(pfx,modrm);
   11558    IRTemp sV      = newTemp(Ity_V128);
   11559    IRTemp d16     = newTemp(Ity_I16);
   11560    UInt   imm8;
   11561    IRTemp s0, s1, s2, s3;
   11562    if (epartIsReg(modrm)) {
   11563       UInt rE = eregOfRexRM(pfx,modrm);
   11564       assign(sV, getXMMReg(rE));
   11565       imm8 = getUChar(delta+1) & 7;
   11566       delta += 1+1;
   11567       DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
   11568           imm8, nameXMMReg(rE), nameIReg32(rG));
   11569    } else {
   11570       /* The memory case is disallowed, apparently. */
   11571       return deltaIN; /* FAIL */
   11572    }
   11573    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11574    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11575    switch (imm8) {
   11576       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
   11577       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
   11578       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
   11579       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
   11580       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
   11581       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
   11582       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
   11583       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
   11584       default: vassert(0);
   11585    }
   11586    putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   11587    return delta;
   11588 }
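
         /* Only imm8[2:0] is significant -- there are just eight 16-bit
            lanes to choose from -- hence the "& 7" where the immediate is
            fetched above. */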
   11589 
   11590 
   11591 static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11592                                Long delta, Bool isAvx )
   11593 {
   11594    IRTemp addr  = IRTemp_INVALID;
   11595    Int    alen  = 0;
   11596    HChar  dis_buf[50];
   11597    UChar  modrm = getUChar(delta);
   11598    IRTemp arg64 = newTemp(Ity_I64);
   11599    UInt   rG    = gregOfRexRM(pfx,modrm);
   11600    const HChar* mbV   = isAvx ? "v" : "";
   11601    if (epartIsReg(modrm)) {
   11602       UInt rE = eregOfRexRM(pfx,modrm);
   11603       assign( arg64, getXMMRegLane64(rE, 0) );
   11604       delta += 1;
   11605       DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   11606    } else {
   11607       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11608       assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   11609       delta += alen;
   11610       DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   11611    }
   11612    putXMMRegLane64F(
   11613       rG, 0,
   11614       unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   11615    );
   11616    putXMMRegLane64F(
   11617       rG, 1,
   11618       unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   11619    );
   11620    if (isAvx)
   11621       putYMMRegLane128(rG, 1, mkV128(0));
   11622    return delta;
   11623 }
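
         /* No rounding mode is consulted above because I32->F64 conversion
            is always exact: every 32-bit integer fits in an F64's 53-bit
            significand. */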
   11624 
   11625 
   11626 static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11627                           Long delta, Bool isAvx )
   11628 {
   11629    IRTemp addr  = IRTemp_INVALID;
   11630    Int    alen  = 0;
   11631    HChar  dis_buf[50];
   11632    UChar  modrm = getUChar(delta);
   11633    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11634    vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
   11635 
   11636    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11637    delta += alen;
   11638 
   11639    /* Fake up a native SSE mxcsr word.  The only thing it depends on
   11640       is SSEROUND[1:0], so call a clean helper to cook it up.
   11641    */
   11642    /* ULong amd64g_create_mxcsr ( ULong sseround ) */
   11643    DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11644    storeLE(
   11645       mkexpr(addr),
   11646       unop(Iop_64to32,
   11647            mkIRExprCCall(
   11648               Ity_I64, 0/*regp*/,
   11649               "amd64g_create_mxcsr", &amd64g_create_mxcsr,
   11650               mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
   11651            )
   11652       )
   11653    );
   11654    return delta;
   11655 }
   11656 
   11657 
   11658 static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11659                           Long delta, Bool isAvx )
   11660 {
   11661    IRTemp addr  = IRTemp_INVALID;
   11662    Int    alen  = 0;
   11663    HChar  dis_buf[50];
   11664    UChar  modrm = getUChar(delta);
   11665    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11666    vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
   11667 
   11668    IRTemp t64 = newTemp(Ity_I64);
   11669    IRTemp ew  = newTemp(Ity_I32);
   11670 
   11671    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11672    delta += alen;
   11673    DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11674 
   11675    /* The only thing we observe in %mxcsr is the rounding mode.
   11676       Therefore, pass the 32-bit value (SSE native-format control
   11677       word) to a clean helper, getting back a 64-bit value, the
   11678       lower half of which is the SSEROUND value to store, and the
   11679       upper half of which is the emulation-warning token which may
   11680       be generated.
   11681    */
   11682    /* ULong amd64g_check_ldmxcsr ( ULong ); */
   11683    assign( t64, mkIRExprCCall(
   11684                    Ity_I64, 0/*regparms*/,
   11685                    "amd64g_check_ldmxcsr",
   11686                    &amd64g_check_ldmxcsr,
   11687                    mkIRExprVec_1(
   11688                       unop(Iop_32Uto64,
   11689                            loadLE(Ity_I32, mkexpr(addr))
   11690                       )
   11691                    )
   11692                 )
   11693          );
   11694 
   11695    put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   11696    assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   11697    put_emwarn( mkexpr(ew) );
   11698    /* Finally, if an emulation warning was reported, side-exit to
   11699       the next insn, reporting the warning, so that Valgrind's
   11700       dispatcher sees the warning. */
   11701    stmt(
   11702       IRStmt_Exit(
   11703          binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
   11704          Ijk_EmWarn,
   11705          IRConst_U64(guest_RIP_bbstart+delta),
   11706          OFFB_RIP
   11707       )
   11708    );
   11709    return delta;
   11710 }
   11711 
   11712 
   11713 static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
   11714 {
   11715    /* ------ rfbm[0] gates the x87 state ------ */
   11716 
   11717    /* Uses dirty helper:
   11718          void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   11719    */
   11720    IRDirty* d0 = unsafeIRDirty_0_N (
   11721                     0/*regparms*/,
   11722                     "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
   11723                     &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
   11724                     mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   11725                  );
   11726    d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
   11727                      mkU64(1));
   11728 
   11729    /* Declare we're writing memory.  Really, bytes 24 through 31
   11730       (MXCSR and MXCSR_MASK) aren't written, but we can't express more
   11731       than 1 memory area here, so just mark the whole thing as
   11732       written. */
   11733    d0->mFx   = Ifx_Write;
   11734    d0->mAddr = mkexpr(addr);
   11735    d0->mSize = 160;
   11736 
   11737    /* declare we're reading guest state */
   11738    d0->nFxState = 5;
   11739    vex_bzero(&d0->fxState, sizeof(d0->fxState));
   11740 
   11741    d0->fxState[0].fx     = Ifx_Read;
   11742    d0->fxState[0].offset = OFFB_FTOP;
   11743    d0->fxState[0].size   = sizeof(UInt);
   11744 
   11745    d0->fxState[1].fx     = Ifx_Read;
   11746    d0->fxState[1].offset = OFFB_FPREGS;
   11747    d0->fxState[1].size   = 8 * sizeof(ULong);
   11748 
   11749    d0->fxState[2].fx     = Ifx_Read;
   11750    d0->fxState[2].offset = OFFB_FPTAGS;
   11751    d0->fxState[2].size   = 8 * sizeof(UChar);
   11752 
   11753    d0->fxState[3].fx     = Ifx_Read;
   11754    d0->fxState[3].offset = OFFB_FPROUND;
   11755    d0->fxState[3].size   = sizeof(ULong);
   11756 
   11757    d0->fxState[4].fx     = Ifx_Read;
   11758    d0->fxState[4].offset = OFFB_FC3210;
   11759    d0->fxState[4].size   = sizeof(ULong);
   11760 
   11761    stmt( IRStmt_Dirty(d0) );
   11762 
   11763    /* ------ rfbm[1] gates the SSE state ------ */
   11764 
   11765    IRTemp rfbm_1    = newTemp(Ity_I64);
   11766    IRTemp rfbm_1or2 = newTemp(Ity_I64);
   11767    assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   11768    assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   11769 
   11770    IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
   11771    IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
   11772 
   11773    /* Uses dirty helper:
   11774          void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
   11775                  ( VexGuestAMD64State*, ULong )
   11776       This creates only MXCSR and MXCSR_MASK.  We need to do this if
   11777       either component 1 (SSE) or 2 (AVX) is requested.  Hence the
   11778       guard condition is a bit more complex.
   11779    */
   11780    IRDirty* d1 = unsafeIRDirty_0_N (
   11781                     0/*regparms*/,
   11782                     "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
   11783                     &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
   11784                     mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   11785                  );
   11786    d1->guard = guard_1or2;
   11787 
   11788    /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
   11789       the code for rfbm[0] just above claims a write of 0 .. 159, so
   11790       this duplicates it.  But at least correctly connects 24 .. 31 to
   11791       the MXCSR guest state representation (SSEROUND field). */
   11792    d1->mFx   = Ifx_Write;
   11793    d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   11794    d1->mSize = 8;
   11795 
   11796    /* declare we're reading guest state */
   11797    d1->nFxState = 1;
   11798    vex_bzero(&d1->fxState, sizeof(d1->fxState));
   11799 
   11800    d1->fxState[0].fx     = Ifx_Read;
   11801    d1->fxState[0].offset = OFFB_SSEROUND;
   11802    d1->fxState[0].size   = sizeof(ULong);
   11803 
   11804    /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
   11805       else.  We do the actual register array, XMM[0..15], separately,
   11806       in order that any undefinedness in the XMM registers is tracked
   11807       separately by Memcheck and does not "infect" the in-memory
   11808       shadow for the other parts of the image. */
   11809    stmt( IRStmt_Dirty(d1) );
   11810 
   11811    /* And now the XMMs themselves. */
   11812    UInt reg;
   11813    for (reg = 0; reg < 16; reg++) {
   11814       stmt( IRStmt_StoreG(
   11815                Iend_LE,
   11816                binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
   11817                getXMMReg(reg),
   11818                guard_1
   11819       ));
   11820    }
   11821 
   11822    /* ------ rfbm[2] gates the AVX state ------ */
   11823    /* Component 2 is just a bunch of register saves, so we'll do it
   11824       inline, just to be simple and to be Memcheck friendly. */
   11825 
   11826    IRTemp rfbm_2 = newTemp(Ity_I64);
   11827    assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   11828 
   11829    IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
   11830 
   11831    for (reg = 0; reg < 16; reg++) {
   11832       stmt( IRStmt_StoreG(
   11833                Iend_LE,
   11834                binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
   11835                getYMMRegLane128(reg,1),
   11836                guard_2
   11837       ));
   11838    }
   11839 }
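
         /* Layout assumed by the stores above (standard, non-compacted
            XSAVE image): bytes 0 .. 159 hold the legacy x87/MXCSR area,
            XMM0..15 live at 160 + 16*reg, the XSAVE header begins at 512,
            and the AVX high halves (component 2) at 576 + 16*reg. */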
   11840 
   11841 
   11842 static Long dis_XSAVE ( const VexAbiInfo* vbi,
   11843                         Prefix pfx, Long delta, Int sz )
   11844 {
   11845    /* Note that the presence or absence of REX.W (indicated here by
   11846       |sz|) slightly affects the written format: whether the saved FPU
   11847       IP and DP pointers are 64 or 32 bits.  But the helper function
   11848       we call simply writes zero bits in the relevant fields, which
   11849       are 64 bits regardless of what REX.W is, and so it's good enough
   11850       (iow, equally broken) in both cases. */
   11851    IRTemp addr  = IRTemp_INVALID;
   11852    Int    alen  = 0;
   11853    HChar  dis_buf[50];
   11854    UChar  modrm = getUChar(delta);
   11855    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11856    vassert(sz == 4 || sz == 8); /* ditto */
   11857 
   11858    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11859    delta += alen;
   11860    gen_SEGV_if_not_64_aligned(addr);
   11861 
   11862    DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11863 
   11864    /* VEX's caller is assumed to have checked this. */
   11865    const ULong aSSUMED_XCR0_VALUE = 7;
   11866 
   11867    IRTemp rfbm = newTemp(Ity_I64);
   11868    assign(rfbm,
   11869           binop(Iop_And64,
   11870                 binop(Iop_Or64,
   11871                       binop(Iop_Shl64,
   11872                             unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
   11873                       unop(Iop_32Uto64, getIRegRAX(4))),
   11874                 mkU64(aSSUMED_XCR0_VALUE)));
   11875 
   11876    gen_XSAVE_SEQUENCE(addr, rfbm);
   11877 
   11878    /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
   11879       OR-ing the RFBM value into it. */
   11880    IRTemp addr_plus_512 = newTemp(Ity_I64);
   11881    assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
   11882    storeLE( mkexpr(addr_plus_512),
   11883             binop(Iop_Or8,
   11884                   unop(Iop_64to8, mkexpr(rfbm)),
   11885                   loadLE(Ity_I8, mkexpr(addr_plus_512))) );
   11886 
   11887    return delta;
   11888 }
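
         /* Example of the RFBM computation above: with the assumed XCR0
            value of 7 (x87 | SSE | AVX) and both EDX and EAX set to
            0xFFFFFFFF, rfbm is 7, so all three gated components are
            written and 7 is OR-ed into XSTATE_BV's low byte. */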
   11889 
   11890 
   11891 static Long dis_FXSAVE ( const VexAbiInfo* vbi,
   11892                          Prefix pfx, Long delta, Int sz )
   11893 {
   11894    /* See comment in dis_XSAVE about the significance of REX.W. */
   11895    IRTemp addr  = IRTemp_INVALID;
   11896    Int    alen  = 0;
   11897    HChar  dis_buf[50];
   11898    UChar  modrm = getUChar(delta);
   11899    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11900    vassert(sz == 4 || sz == 8); /* ditto */
   11901 
   11902    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11903    delta += alen;
   11904    gen_SEGV_if_not_16_aligned(addr);
   11905 
   11906    DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11907 
   11908    /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
   11909       to 0b011, generate the XSAVE sequence accordingly, and let iropt
   11910       fold out the unused (AVX) parts accordingly. */
   11911    IRTemp rfbm = newTemp(Ity_I64);
   11912    assign(rfbm, mkU64(3));
   11913    gen_XSAVE_SEQUENCE(addr, rfbm);
   11914 
   11915    return delta;
   11916 }
   11917 
   11918 
   11919 static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
   11920 {
   11921    /* ------ rfbm[0] gates the x87 state ------ */
   11922 
   11923    /* If rfbm[0] == 1, we have to write the x87 state.  If
   11924       xstate_bv[0] == 1, we will read it from the memory image, else
   11925       we'll set it to initial values.  Doing this with a helper
   11926       function and getting the definedness flow annotations correct is
   11927       too difficult, so generate stupid but simple code: first set the
   11928       registers to initial values, regardless of xstate_bv[0].  Then,
   11929       conditionally restore from the memory image. */
   11930 
   11931    IRTemp rfbm_0       = newTemp(Ity_I64);
   11932    IRTemp xstate_bv_0  = newTemp(Ity_I64);
   11933    IRTemp restore_0    = newTemp(Ity_I64);
   11934    assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
   11935    assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
   11936    assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
   11937 
   11938    gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
   11939 
   11940    /* Uses dirty helper:
   11941          void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   11942    */
   11943    IRDirty* d0 = unsafeIRDirty_0_N (
   11944                     0/*regparms*/,
   11945                     "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
   11946                     &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
   11947                     mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   11948                  );
   11949    d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
   11950 
   11951    /* Declare we're reading memory.  Really, bytes 24 through 31
   11952       (MXCSR and MXCSR_MASK) aren't read, but we can't express more
   11953       than 1 memory area here, so just mark the whole thing as
   11954       read. */
   11955    d0->mFx   = Ifx_Read;
   11956    d0->mAddr = mkexpr(addr);
   11957    d0->mSize = 160;
   11958 
   11959    /* declare we're writing guest state */
   11960    d0->nFxState = 5;
   11961    vex_bzero(&d0->fxState, sizeof(d0->fxState));
   11962 
   11963    d0->fxState[0].fx     = Ifx_Write;
   11964    d0->fxState[0].offset = OFFB_FTOP;
   11965    d0->fxState[0].size   = sizeof(UInt);
   11966 
   11967    d0->fxState[1].fx     = Ifx_Write;
   11968    d0->fxState[1].offset = OFFB_FPREGS;
   11969    d0->fxState[1].size   = 8 * sizeof(ULong);
   11970 
   11971    d0->fxState[2].fx     = Ifx_Write;
   11972    d0->fxState[2].offset = OFFB_FPTAGS;
   11973    d0->fxState[2].size   = 8 * sizeof(UChar);
   11974 
   11975    d0->fxState[3].fx     = Ifx_Write;
   11976    d0->fxState[3].offset = OFFB_FPROUND;
   11977    d0->fxState[3].size   = sizeof(ULong);
   11978 
   11979    d0->fxState[4].fx     = Ifx_Write;
   11980    d0->fxState[4].offset = OFFB_FC3210;
   11981    d0->fxState[4].size   = sizeof(ULong);
   11982 
   11983    stmt( IRStmt_Dirty(d0) );
   11984 
   11985    /* ------ rfbm[1] gates the SSE state ------ */
   11986 
   11987    /* Same scheme as component 0: first zero it out, and then possibly
   11988       restore from the memory area. */
   11989    IRTemp rfbm_1       = newTemp(Ity_I64);
   11990    IRTemp xstate_bv_1  = newTemp(Ity_I64);
   11991    IRTemp restore_1    = newTemp(Ity_I64);
   11992    assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   11993    assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
   11994    assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
   11995    IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
   11996    IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
   11997 
   11998    IRTemp rfbm_1or2       = newTemp(Ity_I64);
   11999    IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
   12000    IRTemp restore_1or2    = newTemp(Ity_I64);
   12001    assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   12002    assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
   12003    assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
   12004                                            mkexpr(xstate_bv_1or2)));
   12005    IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
   12006    IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
   12007 
   12008    /* The areas in question are: SSEROUND, and the XMM register array. */
   12009    putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
   12010 
   12011    UInt reg;
   12012    for (reg = 0; reg < 16; reg++) {
   12013       putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
   12014    }
   12015 
   12016    /* And now possibly restore from MXCSR/MXCSR_MASK */
   12017    /* Uses dirty helper:
   12018          void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
   12019                  ( VexGuestAMD64State*, ULong )
   12020       This restores from only MXCSR and MXCSR_MASK.  We need to do
   12021       this if either component 1 (SSE) or 2 (AVX) is requested.
   12022       Hence the guard condition is a bit more complex.
   12023    */
   12024    IRDirty* d1 = unsafeIRDirty_0_N (
   12025                     0/*regparms*/,
   12026                     "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
   12027                     &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
   12028                     mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   12029                  );
   12030    d1->guard = restore_1or2e;
   12031 
   12032    /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
   12033       the code for rfbm[0] just above claims a read of 0 .. 159, so
   12034       this duplicates it.  But at least correctly connects 24 .. 31 to
   12035       the MXCSR guest state representation (SSEROUND field). */
   12036    d1->mFx   = Ifx_Read;
   12037    d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   12038    d1->mSize = 8;
   12039 
   12040    /* declare we're writing guest state */
   12041    d1->nFxState = 1;
   12042    vex_bzero(&d1->fxState, sizeof(d1->fxState));
   12043 
   12044    d1->fxState[0].fx     = Ifx_Write;
   12045    d1->fxState[0].offset = OFFB_SSEROUND;
   12046    d1->fxState[0].size   = sizeof(ULong);
   12047 
   12048    /* Call the helper.  This creates SSEROUND but nothing
   12049       else.  We do the actual register array, XMM[0..15], separately,
   12050       in order that any undefinedness in the XMM registers is tracked
   12051       separately by Memcheck and is not "infected" by the in-memory
   12052       shadow for the other parts of the image. */
   12053    stmt( IRStmt_Dirty(d1) );
   12054 
   12055    /* And now the XMMs themselves.  For each register, we PUT either
   12056       its old value, or the value loaded from memory.  One convenient
   12057       way to do that is with a conditional load whose default value
   12058       is the old value of the register. */
   12059    for (reg = 0; reg < 16; reg++) {
   12060       IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
   12061       IRExpr* alt = getXMMReg(reg);
   12062       IRTemp  loadedValue = newTemp(Ity_V128);
   12063       stmt( IRStmt_LoadG(Iend_LE,
   12064                          ILGop_IdentV128,
   12065                          loadedValue, ea, alt, restore_1e) );
   12066       putXMMReg(reg, mkexpr(loadedValue));
   12067    }
   12068 
   12069    /* ------ rfbm[2] gates the AVX state ------ */
   12070    /* Component 2 is just a bunch of register loads, so we'll do it
   12071       inline, just to be simple and to be Memcheck friendly. */
   12072 
   12073    /* Same scheme as component 0: first zero it out, and then possibly
   12074       restore from the memory area. */
   12075    IRTemp rfbm_2      = newTemp(Ity_I64);
   12076    IRTemp xstate_bv_2 = newTemp(Ity_I64);
   12077    IRTemp restore_2   = newTemp(Ity_I64);
   12078    assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   12079    assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
   12080    assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
   12081 
   12082    IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
   12083    IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
   12084 
   12085    for (reg = 0; reg < 16; reg++) {
   12086       putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
   12087    }
   12088 
   12089    for (reg = 0; reg < 16; reg++) {
   12090       IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
   12091       IRExpr* alt = getYMMRegLane128(reg, 1);
   12092       IRTemp  loadedValue = newTemp(Ity_V128);
   12093       stmt( IRStmt_LoadG(Iend_LE,
   12094                          ILGop_IdentV128,
   12095                          loadedValue, ea, alt, restore_2e) );
   12096       putYMMRegLane128(reg, 1, mkexpr(loadedValue));
   12097    }
   12098 }
   12099 
   12100 
   12101 static Long dis_XRSTOR ( const VexAbiInfo* vbi,
   12102                          Prefix pfx, Long delta, Int sz )
   12103 {
   12104    /* As with XSAVE above, we ignore the value of REX.W since we're
   12105       not bothering with the FPU DP and IP fields. */
   12106    IRTemp addr  = IRTemp_INVALID;
   12107    Int    alen  = 0;
   12108    HChar  dis_buf[50];
   12109    UChar  modrm = getUChar(delta);
   12110    vassert(!epartIsReg(modrm)); /* ensured by caller */
   12111    vassert(sz == 4 || sz == 8); /* ditto */
   12112 
   12113    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12114    delta += alen;
   12115    gen_SEGV_if_not_64_aligned(addr);
   12116 
   12117    DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   12118 
   12119    /* VEX's caller is assumed to have checked this. */
   12120    const ULong aSSUMED_XCR0_VALUE = 7;
   12121 
   12122    IRTemp rfbm = newTemp(Ity_I64);
   12123    assign(rfbm,
   12124           binop(Iop_And64,
   12125                 binop(Iop_Or64,
   12126                       binop(Iop_Shl64,
   12127                             unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
   12128                       unop(Iop_32Uto64, getIRegRAX(4))),
   12129                 mkU64(aSSUMED_XCR0_VALUE)));
   12130 
   12131    IRTemp xstate_bv = newTemp(Ity_I64);
   12132    assign(xstate_bv, loadLE(Ity_I64,
   12133                             binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
   12134 
   12135    IRTemp xcomp_bv = newTemp(Ity_I64);
   12136    assign(xcomp_bv, loadLE(Ity_I64,
   12137                            binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
   12138 
   12139    IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
   12140    assign( xsavehdr_23_16,
   12141            loadLE(Ity_I64,
   12142                   binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
   12143 
   12144    /* We must fault if
   12145       * xcomp_bv[63] == 1, since this simulated CPU does not support
   12146         the compaction extension.
   12147       * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
   12148       * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
   12149         imply that xcomp_bv must be zero.
   12150       xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
   12151    */
   12152    IRTemp fault_if_nonzero = newTemp(Ity_I64);
   12153    assign(fault_if_nonzero,
   12154           binop(Iop_Or64,
   12155                 binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
   12156                 binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
   12157    stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
   12158                      Ijk_SigSEGV,
   12159                      IRConst_U64(guest_RIP_curr_instr),
   12160                      OFFB_RIP
   12161    ));
   12162 
   12163    /* We are guaranteed now that both xstate_bv and rfbm are in the
   12164       range 0 .. 7.  Generate the restore sequence proper. */
   12165    gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
   12166 
   12167    return delta;
   12168 }
   12169 
   12170 
   12171 static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
   12172                           Prefix pfx, Long delta, Int sz )
   12173 {
   12174    /* As with FXSAVE above we ignore the value of REX.W since we're
   12175       not bothering with the FPU DP and IP fields. */
   12176    IRTemp addr  = IRTemp_INVALID;
   12177    Int    alen  = 0;
   12178    HChar  dis_buf[50];
   12179    UChar  modrm = getUChar(delta);
   12180    vassert(!epartIsReg(modrm)); /* ensured by caller */
   12181    vassert(sz == 4 || sz == 8); /* ditto */
   12182 
   12183    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12184    delta += alen;
   12185    gen_SEGV_if_not_16_aligned(addr);
   12186 
   12187    DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   12188 
   12189    /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
   12190       as if components 0 and 1 are set as present in XSTATE_BV in the
   12191       XSAVE header.  Set both rfbm and xstate_bv to 0b011 therefore,
   12192       generate the XRSTOR sequence accordingly, and let iropt fold out
   12193       the unused (AVX) parts accordingly. */
   12194    IRTemp three = newTemp(Ity_I64);
   12195    assign(three, mkU64(3));
   12196    gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
   12197 
   12198    return delta;
   12199 }
   12200 
   12201 
   12202 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   12203 {
   12204    vassert(imm8 <= 7);  /* imm8 is unsigned, so >= 0 holds trivially */
   12205 
   12206    // Create a V128 value which has the selected word in the
   12207    // specified lane, and zeroes everywhere else.
   12208    IRTemp tmp128    = newTemp(Ity_V128);
   12209    IRTemp halfshift = newTemp(Ity_I64);
   12210    assign(halfshift, binop(Iop_Shl64,
   12211                            unop(Iop_16Uto64, mkexpr(u16)),
   12212                            mkU8(16 * (imm8 & 3))));
   12213    if (imm8 < 4) {
   12214       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   12215    } else {
   12216       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   12217    }
   12218 
   12219    UShort mask = ~(3 << (imm8 * 2));
   12220    IRTemp res  = newTemp(Ity_V128);
   12221    assign( res, binop(Iop_OrV128,
   12222                       mkexpr(tmp128),
   12223                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   12224    return res;
   12225 }
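
         /* Worked example: imm8 = 5 places u16 in word lane 5.  halfshift
            becomes u16 << 16 (since (imm8 & 3) == 1) and lands in the upper
            64 bits (imm8 >= 4); mask = ~(3 << 10) = 0xF3FF clears byte
            lanes 10 and 11 -- exactly word lane 5 -- before the new value
            is OR-ed in. */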
   12226 
   12227 
   12228 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   12229 {
   12230    IRTemp s1, s0, d1, d0;
   12231    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   12232 
   12233    breakupV128to64s( sV, &s1, &s0 );
   12234    breakupV128to64s( dV, &d1, &d0 );
   12235 
   12236    IRTemp res = newTemp(Ity_V128);
   12237    assign( res,
   12238            binop(Iop_64HLtoV128,
   12239                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12240                                "amd64g_calculate_mmx_psadbw",
   12241                                &amd64g_calculate_mmx_psadbw,
   12242                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   12243                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12244                                "amd64g_calculate_mmx_psadbw",
   12245                                &amd64g_calculate_mmx_psadbw,
   12246                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   12247    return res;
   12248 }
   12249 
   12250 
   12251 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   12252 {
   12253    IRTemp sHi, sLo, dHi, dLo;
   12254    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   12255    breakupV256toV128s( dV, &dHi, &dLo);
   12256    breakupV256toV128s( sV, &sHi, &sLo);
   12257    IRTemp res = newTemp(Ity_V256);
   12258    assign(res, binop(Iop_V128HLtoV256,
   12259                      mkexpr(math_PSADBW_128(dHi, sHi)),
   12260                      mkexpr(math_PSADBW_128(dLo, sLo))));
   12261    return res;
   12262 }
   12263 
   12264 
   12265 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
   12266                              Long delta, Bool isAvx )
   12267 {
   12268    IRTemp regD    = newTemp(Ity_V128);
   12269    IRTemp mask    = newTemp(Ity_V128);
   12270    IRTemp olddata = newTemp(Ity_V128);
   12271    IRTemp newdata = newTemp(Ity_V128);
   12272    IRTemp addr    = newTemp(Ity_I64);
   12273    UChar  modrm   = getUChar(delta);
   12274    UInt   rG      = gregOfRexRM(pfx,modrm);
   12275    UInt   rE      = eregOfRexRM(pfx,modrm);
   12276 
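            /* The store address is implicitly (R|E)DI; address-size and
               segment overrides from the prefix still apply to it. */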
   12277    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   12278    assign( regD, getXMMReg( rG ));
   12279 
   12280    /* Unfortunately we can't do the obvious thing with SarN8x16
   12281       here, since that can't be re-emitted as SSE2 code - no such
   12282       insn exists. */
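            /* Instead, arithmetic-shift each byte right by 7 in two 8x8
               halves.  That replicates each byte's sign bit across the
               whole byte -- e.g. 0x80 -> 0xFF, 0x7F -> 0x00 -- giving the
               per-byte store mask. */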
   12283    assign( mask,
   12284            binop(Iop_64HLtoV128,
   12285                  binop(Iop_SarN8x8,
   12286                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   12287                        mkU8(7) ),
   12288                  binop(Iop_SarN8x8,
   12289                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   12290                        mkU8(7) ) ));
   12291    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   12292    assign( newdata, binop(Iop_OrV128,
   12293                           binop(Iop_AndV128,
   12294                                 mkexpr(regD),
   12295                                 mkexpr(mask) ),
   12296                           binop(Iop_AndV128,
   12297                                 mkexpr(olddata),
   12298                                 unop(Iop_NotV128, mkexpr(mask)))) );
   12299    storeLE( mkexpr(addr), mkexpr(newdata) );
   12300 
   12301    delta += 1;
   12302    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   12303        nameXMMReg(rE), nameXMMReg(rG) );
   12304    return delta;
   12305 }
   12306 
   12307 
   12308 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12309                                Long delta, Bool isAvx )
   12310 {
   12311    UChar modrm = getUChar(delta);
   12312    UInt   rG   = gregOfRexRM(pfx,modrm);
   12313    UInt   rE   = eregOfRexRM(pfx,modrm);
   12314    IRTemp t0   = newTemp(Ity_I32);
   12315    IRTemp t1   = newTemp(Ity_I32);
   12316    IRTemp t2   = newTemp(Ity_I32);
   12317    IRTemp t3   = newTemp(Ity_I32);
   12318    delta += 1;
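            /* Lane i is shifted right by (31 - i) and masked with (1 << i),
               so bit i of the result is exactly the sign bit of F32 lane i;
               the Or32 tree then merges the four bits. */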
   12319    assign( t0, binop( Iop_And32,
   12320                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   12321                       mkU32(1) ));
   12322    assign( t1, binop( Iop_And32,
   12323                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   12324                       mkU32(2) ));
   12325    assign( t2, binop( Iop_And32,
   12326                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   12327                       mkU32(4) ));
   12328    assign( t3, binop( Iop_And32,
   12329                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   12330                       mkU32(8) ));
   12331    putIReg32( rG, binop(Iop_Or32,
   12332                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12333                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12334    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   12335        nameXMMReg(rE), nameIReg32(rG));
   12336    return delta;
   12337 }
   12338 
   12339 
   12340 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12341 {
   12342    UChar modrm = getUChar(delta);
   12343    UInt   rG   = gregOfRexRM(pfx,modrm);
   12344    UInt   rE   = eregOfRexRM(pfx,modrm);
   12345    IRTemp t0   = newTemp(Ity_I32);
   12346    IRTemp t1   = newTemp(Ity_I32);
   12347    IRTemp t2   = newTemp(Ity_I32);
   12348    IRTemp t3   = newTemp(Ity_I32);
   12349    IRTemp t4   = newTemp(Ity_I32);
   12350    IRTemp t5   = newTemp(Ity_I32);
   12351    IRTemp t6   = newTemp(Ity_I32);
   12352    IRTemp t7   = newTemp(Ity_I32);
   12353    delta += 1;
   12354    assign( t0, binop( Iop_And32,
   12355                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   12356                       mkU32(1) ));
   12357    assign( t1, binop( Iop_And32,
   12358                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   12359                       mkU32(2) ));
   12360    assign( t2, binop( Iop_And32,
   12361                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   12362                       mkU32(4) ));
   12363    assign( t3, binop( Iop_And32,
   12364                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   12365                       mkU32(8) ));
   12366    assign( t4, binop( Iop_And32,
   12367                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   12368                       mkU32(16) ));
   12369    assign( t5, binop( Iop_And32,
   12370                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   12371                       mkU32(32) ));
   12372    assign( t6, binop( Iop_And32,
   12373                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   12374                       mkU32(64) ));
   12375    assign( t7, binop( Iop_And32,
   12376                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   12377                       mkU32(128) ));
   12378    putIReg32( rG, binop(Iop_Or32,
   12379                         binop(Iop_Or32,
   12380                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12381                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   12382                         binop(Iop_Or32,
   12383                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   12384                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   12385    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12386    return delta;
   12387 }
   12388 
   12389 
   12390 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12391                                Long delta, Bool isAvx )
   12392 {
   12393    UChar modrm = getUChar(delta);
   12394    UInt   rG   = gregOfRexRM(pfx,modrm);
   12395    UInt   rE   = eregOfRexRM(pfx,modrm);
   12396    IRTemp t0   = newTemp(Ity_I32);
   12397    IRTemp t1   = newTemp(Ity_I32);
   12398    delta += 1;
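            /* 32-bit lanes 1 and 3 are the high halves of the two F64s, so
               their top bits are the F64 sign bits. */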
   12399    assign( t0, binop( Iop_And32,
   12400                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   12401                       mkU32(1) ));
   12402    assign( t1, binop( Iop_And32,
   12403                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   12404                       mkU32(2) ));
   12405    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   12406    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   12407        nameXMMReg(rE), nameIReg32(rG));
   12408    return delta;
   12409 }
   12410 
   12411 
   12412 static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12413 {
   12414    UChar modrm = getUChar(delta);
   12415    UInt   rG   = gregOfRexRM(pfx,modrm);
   12416    UInt   rE   = eregOfRexRM(pfx,modrm);
   12417    IRTemp t0   = newTemp(Ity_I32);
   12418    IRTemp t1   = newTemp(Ity_I32);
   12419    IRTemp t2   = newTemp(Ity_I32);
   12420    IRTemp t3   = newTemp(Ity_I32);
   12421    delta += 1;
   12422    assign( t0, binop( Iop_And32,
   12423                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   12424                       mkU32(1) ));
   12425    assign( t1, binop( Iop_And32,
   12426                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   12427                       mkU32(2) ));
   12428    assign( t2, binop( Iop_And32,
   12429                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   12430                       mkU32(4) ));
   12431    assign( t3, binop( Iop_And32,
   12432                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   12433                       mkU32(8) ));
   12434    putIReg32( rG, binop(Iop_Or32,
   12435                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12436                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12437    DIP("vmovmskpd %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12438    return delta;
   12439 }
   12440 
   12441 
   12442 /* Note, this also handles SSE(1) insns. */
   12443 __attribute__((noinline))
   12444 static
   12445 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   12446                         const VexArchInfo* archinfo,
   12447                         const VexAbiInfo* vbi,
   12448                         Prefix pfx, Int sz, Long deltaIN,
   12449                         DisResult* dres )
   12450 {
   12451    IRTemp addr  = IRTemp_INVALID;
   12452    IRTemp t0    = IRTemp_INVALID;
   12453    IRTemp t1    = IRTemp_INVALID;
   12454    IRTemp t2    = IRTemp_INVALID;
   12455    IRTemp t3    = IRTemp_INVALID;
   12456    IRTemp t4    = IRTemp_INVALID;
   12457    IRTemp t5    = IRTemp_INVALID;
   12458    IRTemp t6    = IRTemp_INVALID;
   12459    UChar  modrm = 0;
   12460    Int    alen  = 0;
   12461    HChar  dis_buf[50];
   12462 
   12463    *decode_OK = False;
   12464 
   12465    Long   delta = deltaIN;
   12466    UChar  opc   = getUChar(delta);
   12467    delta++;
   12468    switch (opc) {
   12469 
   12470    case 0x10:
   12471       if (have66noF2noF3(pfx)
   12472           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12473          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   12474          modrm = getUChar(delta);
   12475          if (epartIsReg(modrm)) {
   12476             putXMMReg( gregOfRexRM(pfx,modrm),
   12477                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12478             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12479                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12480             delta += 1;
   12481          } else {
   12482             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12483             putXMMReg( gregOfRexRM(pfx,modrm),
   12484                        loadLE(Ity_V128, mkexpr(addr)) );
   12485             DIP("movupd %s,%s\n", dis_buf,
   12486                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12487             delta += alen;
   12488          }
   12489          goto decode_success;
   12490       }
   12491       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   12492          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   12493          If E is reg, upper half of G is unchanged. */
   12494       if (haveF2no66noF3(pfx)
   12495           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   12496          modrm = getUChar(delta);
   12497          if (epartIsReg(modrm)) {
   12498             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12499                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   12500             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12501                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12502             delta += 1;
   12503          } else {
   12504             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12505             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12506             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12507                              loadLE(Ity_I64, mkexpr(addr)) );
   12508             DIP("movsd %s,%s\n", dis_buf,
   12509                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12510             delta += alen;
   12511          }
   12512          goto decode_success;
   12513       }
   12514       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   12515          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   12516       if (haveF3no66noF2(pfx)
   12517           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12518          modrm = getUChar(delta);
   12519          if (epartIsReg(modrm)) {
   12520             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12521                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   12522             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12523                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12524             delta += 1;
   12525          } else {
   12526             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12527             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12528             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12529                              loadLE(Ity_I32, mkexpr(addr)) );
   12530             DIP("movss %s,%s\n", dis_buf,
   12531                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12532             delta += alen;
   12533          }
   12534          goto decode_success;
   12535       }
   12536       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   12537       if (haveNo66noF2noF3(pfx)
   12538           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12539          modrm = getUChar(delta);
   12540          if (epartIsReg(modrm)) {
   12541             putXMMReg( gregOfRexRM(pfx,modrm),
   12542                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12543             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12544                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12545             delta += 1;
   12546          } else {
   12547             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12548             putXMMReg( gregOfRexRM(pfx,modrm),
   12549                        loadLE(Ity_V128, mkexpr(addr)) );
   12550             DIP("movups %s,%s\n", dis_buf,
   12551                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   12552                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12553          }
   12554          goto decode_success;
   12555       }
   12556       break;
   12557 
   12558    case 0x11:
   12559       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   12560          or lo half xmm). */
   12561       if (haveF2no66noF3(pfx)
   12562           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12563          modrm = getUChar(delta);
   12564          if (epartIsReg(modrm)) {
   12565             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   12566                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   12567             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12568                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   12569             delta += 1;
   12570          } else {
   12571             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12572             storeLE( mkexpr(addr),
   12573                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   12574             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12575                                  dis_buf);
   12576             delta += alen;
   12577          }
   12578          goto decode_success;
   12579       }
   12580       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   12581          or lo 1/4 xmm). */
   12582       if (haveF3no66noF2(pfx) && sz == 4) {
   12583          modrm = getUChar(delta);
   12584          if (epartIsReg(modrm)) {
   12585             /* fall through, we don't yet have a test case */
   12586          } else {
   12587             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12588             storeLE( mkexpr(addr),
   12589                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   12590             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12591                                  dis_buf);
   12592             delta += alen;
   12593             goto decode_success;
   12594          }
   12595       }
   12596       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   12597       if (have66noF2noF3(pfx)
   12598           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12599          modrm = getUChar(delta);
   12600          if (epartIsReg(modrm)) {
   12601             putXMMReg( eregOfRexRM(pfx,modrm),
   12602                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   12603             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12604                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12605             delta += 1;
   12606          } else {
   12607             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12608             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12609             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12610                                   dis_buf );
   12611             delta += alen;
   12612          }
   12613          goto decode_success;
   12614       }
   12615       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   12616       if (haveNo66noF2noF3(pfx)
   12617           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12618          modrm = getUChar(delta);
   12619          if (epartIsReg(modrm)) {
   12620             /* fall through; awaiting test case */
   12621          } else {
   12622             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12623             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12624             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12625                                   dis_buf );
   12626             delta += alen;
   12627             goto decode_success;
   12628          }
   12629       }
   12630       break;
   12631 
   12632    case 0x12:
   12633       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   12634       /* Identical to MOVLPS? */
   12635       if (have66noF2noF3(pfx)
   12636           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12637          modrm = getUChar(delta);
   12638          if (epartIsReg(modrm)) {
   12639             /* fall through; apparently reg-reg is not possible */
   12640          } else {
   12641             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12642             delta += alen;
   12643             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12644                              0/*lower lane*/,
   12645                              loadLE(Ity_I64, mkexpr(addr)) );
   12646             DIP("movlpd %s, %s\n",
   12647                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12648             goto decode_success;
   12649          }
   12650       }
   12651       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   12652       /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   12653       if (haveNo66noF2noF3(pfx)
   12654           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12655          modrm = getUChar(delta);
   12656          if (epartIsReg(modrm)) {
   12657             delta += 1;
   12658             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12659                              0/*lower lane*/,
   12660                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   12661             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12662                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12663          } else {
   12664             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12665             delta += alen;
   12666             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   12667                              loadLE(Ity_I64, mkexpr(addr)) );
   12668             DIP("movlps %s, %s\n",
   12669                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12670          }
   12671          goto decode_success;
   12672       }
   12673       break;
   12674 
   12675    case 0x13:
   12676       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   12677       if (haveNo66noF2noF3(pfx)
   12678           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12679          modrm = getUChar(delta);
   12680          if (!epartIsReg(modrm)) {
   12681             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12682             delta += alen;
   12683             storeLE( mkexpr(addr),
   12684                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12685                                       0/*lower lane*/ ) );
   12686             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12687                                    dis_buf);
   12688             goto decode_success;
   12689          }
   12690          /* else fall through */
   12691       }
   12692       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   12693       /* Identical to MOVLPS? */
   12694       if (have66noF2noF3(pfx)
   12695           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12696          modrm = getUChar(delta);
   12697          if (!epartIsReg(modrm)) {
   12698             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12699             delta += alen;
   12700             storeLE( mkexpr(addr),
   12701                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12702                                       0/*lower lane*/ ) );
   12703             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12704                                    dis_buf);
   12705             goto decode_success;
   12706          }
   12707          /* else fall through */
   12708       }
   12709       break;
   12710 
   12711    case 0x14:
   12712    case 0x15:
   12713       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   12714       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   12715       /* These just appear to be special cases of SHUFPS */
   12716       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12717          Bool   hi = toBool(opc == 0x15);
   12718          IRTemp sV = newTemp(Ity_V128);
   12719          IRTemp dV = newTemp(Ity_V128);
   12720          modrm = getUChar(delta);
   12721          UInt   rG = gregOfRexRM(pfx,modrm);
   12722          assign( dV, getXMMReg(rG) );
   12723          if (epartIsReg(modrm)) {
   12724             UInt rE = eregOfRexRM(pfx,modrm);
   12725             assign( sV, getXMMReg(rE) );
   12726             delta += 1;
   12727             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12728                 nameXMMReg(rE), nameXMMReg(rG));
   12729          } else {
   12730             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12731             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12732             delta += alen;
   12733             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12734                 dis_buf, nameXMMReg(rG));
   12735          }
   12736          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   12737          putXMMReg( rG, mkexpr(res) );
   12738          goto decode_success;
   12739       }
   12740       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   12741       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   12742       /* These just appear to be special cases of SHUFPD */
   12743       if (have66noF2noF3(pfx)
   12744           && sz == 2 /* could be 8 if rex also present */) {
   12745          Bool   hi = toBool(opc == 0x15);
   12746          IRTemp sV = newTemp(Ity_V128);
   12747          IRTemp dV = newTemp(Ity_V128);
   12748          modrm = getUChar(delta);
   12749          UInt   rG = gregOfRexRM(pfx,modrm);
   12750          assign( dV, getXMMReg(rG) );
   12751          if (epartIsReg(modrm)) {
   12752             UInt rE = eregOfRexRM(pfx,modrm);
   12753             assign( sV, getXMMReg(rE) );
   12754             delta += 1;
   12755             DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
   12756                 nameXMMReg(rE), nameXMMReg(rG));
   12757          } else {
   12758             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12759             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12760             delta += alen;
   12761             DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
   12762                 dis_buf, nameXMMReg(rG));
   12763          }
   12764          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   12765          putXMMReg( rG, mkexpr(res) );
   12766          goto decode_success;
   12767       }
   12768       break;
   12769 
   12770    case 0x16:
   12771       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   12772       /* This seems identical to MOVHPS.  This instruction encoding is
   12773          completely crazy. */
   12774       if (have66noF2noF3(pfx)
   12775           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12776          modrm = getUChar(delta);
   12777          if (epartIsReg(modrm)) {
   12778             /* fall through; apparently reg-reg is not possible */
   12779          } else {
   12780             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12781             delta += alen;
   12782             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12783                              loadLE(Ity_I64, mkexpr(addr)) );
   12784             DIP("movhpd %s,%s\n", dis_buf,
   12785                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12786             goto decode_success;
   12787          }
   12788       }
   12789       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   12790       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   12791       if (haveNo66noF2noF3(pfx)
   12792           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12793          modrm = getUChar(delta);
   12794          if (epartIsReg(modrm)) {
   12795             delta += 1;
   12796             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12797                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   12798             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12799                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12800          } else {
   12801             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12802             delta += alen;
   12803             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12804                              loadLE(Ity_I64, mkexpr(addr)) );
   12805             DIP("movhps %s,%s\n", dis_buf,
   12806                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12807          }
   12808          goto decode_success;
   12809       }
   12810       break;
   12811 
   12812    case 0x17:
   12813       /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   12814       if (haveNo66noF2noF3(pfx)
   12815           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12816          modrm = getUChar(delta);
   12817          if (!epartIsReg(modrm)) {
   12818             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12819             delta += alen;
   12820             storeLE( mkexpr(addr),
   12821                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12822                                       1/*upper lane*/ ) );
   12823             DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12824                                   dis_buf);
   12825             goto decode_success;
   12826          }
   12827          /* else fall through */
   12828       }
   12829       /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   12830       /* Again, this seems identical to MOVHPS. */
   12831       if (have66noF2noF3(pfx)
   12832           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12833          modrm = getUChar(delta);
   12834          if (!epartIsReg(modrm)) {
   12835             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12836             delta += alen;
   12837             storeLE( mkexpr(addr),
   12838                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12839                                       1/*upper lane*/ ) );
   12840             DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12841                                   dis_buf);
   12842             goto decode_success;
   12843          }
   12844          /* else fall through */
   12845       }
   12846       break;
   12847 
   12848    case 0x18:
   12849       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   12850       /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   12851       /* 0F 18 /2 = PREFETCHT1 */
   12852       /* 0F 18 /3 = PREFETCHT2 */
   12853       if (haveNo66noF2noF3(pfx)
   12854           && !epartIsReg(getUChar(delta))
   12855           && gregLO3ofRM(getUChar(delta)) >= 0
   12856           && gregLO3ofRM(getUChar(delta)) <= 3) {
   12857          const HChar* hintstr = "??";
   12858 
   12859          modrm = getUChar(delta);
   12860          vassert(!epartIsReg(modrm));
   12861 
   12862          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12863          delta += alen;
   12864 
   12865          switch (gregLO3ofRM(modrm)) {
   12866             case 0: hintstr = "nta"; break;
   12867             case 1: hintstr = "t0"; break;
   12868             case 2: hintstr = "t1"; break;
   12869             case 3: hintstr = "t2"; break;
   12870             default: vassert(0);
   12871          }
   12872 
   12873          DIP("prefetch%s %s\n", hintstr, dis_buf);
   12874          goto decode_success;
   12875       }
   12876       break;
   12877 
   12878    case 0x28:
   12879       /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   12880       if (have66noF2noF3(pfx)
   12881           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12882          modrm = getUChar(delta);
   12883          if (epartIsReg(modrm)) {
   12884             putXMMReg( gregOfRexRM(pfx,modrm),
   12885                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12886             DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12887                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12888             delta += 1;
   12889          } else {
   12890             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12891             gen_SEGV_if_not_16_aligned( addr );
   12892             putXMMReg( gregOfRexRM(pfx,modrm),
   12893                        loadLE(Ity_V128, mkexpr(addr)) );
   12894             DIP("movapd %s,%s\n", dis_buf,
   12895                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12896             delta += alen;
   12897          }
   12898          goto decode_success;
   12899       }
   12900       /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   12901       if (haveNo66noF2noF3(pfx)
   12902           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12903          modrm = getUChar(delta);
   12904          if (epartIsReg(modrm)) {
   12905             putXMMReg( gregOfRexRM(pfx,modrm),
   12906                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12907             DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12908                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12909             delta += 1;
   12910          } else {
   12911             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12912             gen_SEGV_if_not_16_aligned( addr );
   12913             putXMMReg( gregOfRexRM(pfx,modrm),
   12914                        loadLE(Ity_V128, mkexpr(addr)) );
   12915             DIP("movaps %s,%s\n", dis_buf,
   12916                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12917             delta += alen;
   12918          }
   12919          goto decode_success;
   12920       }
   12921       break;
   12922 
   12923    case 0x29:
   12924       /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   12925       if (haveNo66noF2noF3(pfx)
   12926           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12927          modrm = getUChar(delta);
   12928          if (epartIsReg(modrm)) {
   12929             putXMMReg( eregOfRexRM(pfx,modrm),
   12930                        getXMMReg( gregOfRexRM(pfx,modrm) ));
   12931             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12932                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12933             delta += 1;
   12934          } else {
   12935             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12936             gen_SEGV_if_not_16_aligned( addr );
   12937             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12938             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12939                                   dis_buf );
   12940             delta += alen;
   12941          }
   12942          goto decode_success;
   12943       }
   12944       /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   12945       if (have66noF2noF3(pfx)
   12946           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12947          modrm = getUChar(delta);
   12948          if (epartIsReg(modrm)) {
   12949             putXMMReg( eregOfRexRM(pfx,modrm),
   12950                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   12951             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12952                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12953             delta += 1;
   12954          } else {
   12955             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12956             gen_SEGV_if_not_16_aligned( addr );
   12957             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12958             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12959                                   dis_buf );
   12960             delta += alen;
   12961          }
   12962          goto decode_success;
   12963       }
   12964       break;
   12965 
   12966    case 0x2A:
   12967       /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   12968          half xmm */
   12969       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12970          IRTemp arg64 = newTemp(Ity_I64);
   12971          IRTemp rmode = newTemp(Ity_I32);
   12972 
   12973          modrm = getUChar(delta);
   12974          if (epartIsReg(modrm)) {
   12975             /* Only switch to MMX mode if the source is a MMX register.
   12976                See comments on CVTPI2PD for details.  Fixes #357059. */
   12977             do_MMX_preamble();
   12978             assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
   12979             delta += 1;
   12980             DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   12981                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12982          } else {
   12983             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12984             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   12985             delta += alen;
   12986             DIP("cvtpi2ps %s,%s\n", dis_buf,
   12987                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
   12988          }
   12989 
   12990          assign( rmode, get_sse_roundingmode() );
   12991 
   12992          putXMMRegLane32F(
   12993             gregOfRexRM(pfx,modrm), 0,
   12994             binop(Iop_F64toF32,
   12995                   mkexpr(rmode),
   12996                   unop(Iop_I32StoF64,
   12997                        unop(Iop_64to32, mkexpr(arg64)) )) );
   12998 
   12999          putXMMRegLane32F(
   13000             gregOfRexRM(pfx,modrm), 1,
   13001             binop(Iop_F64toF32,
   13002                   mkexpr(rmode),
   13003                   unop(Iop_I32StoF64,
   13004                        unop(Iop_64HIto32, mkexpr(arg64)) )) );
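                  /* Lanes 2 and 3 of the destination are left unchanged:
                     CVTPI2PS writes only the low 64 bits of the XMM
                     register. */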
   13005 
   13006          goto decode_success;
   13007       }
   13008       /* F3 0F 2A = CVTSI2SS
   13009          -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
   13010          -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
   13011       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   13012          IRTemp rmode = newTemp(Ity_I32);
   13013          assign( rmode, get_sse_roundingmode() );
   13014          modrm = getUChar(delta);
   13015          if (sz == 4) {
   13016             IRTemp arg32 = newTemp(Ity_I32);
   13017             if (epartIsReg(modrm)) {
   13018                assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
   13019                delta += 1;
   13020                DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   13021                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   13022             } else {
   13023                addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13024                assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   13025                delta += alen;
   13026                DIP("cvtsi2ss %s,%s\n", dis_buf,
   13027                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
   13028             }
   13029             putXMMRegLane32F(
   13030                gregOfRexRM(pfx,modrm), 0,
   13031                binop(Iop_F64toF32,
   13032                      mkexpr(rmode),
   13033                      unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   13034          } else {
   13035             /* sz == 8 */
   13036             IRTemp arg64 = newTemp(Ity_I64);
   13037             if (epartIsReg(modrm)) {
   13038                assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
   13039                delta += 1;
   13040                DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   13041                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   13042             } else {
   13043                addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13044                assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   13045                delta += alen;
   13046                DIP("cvtsi2ssq %s,%s\n", dis_buf,
   13047                                         nameXMMReg(gregOfRexRM(pfx,modrm)) );
   13048             }
   13049             putXMMRegLane32F(
   13050                gregOfRexRM(pfx,modrm), 0,
   13051                binop(Iop_F64toF32,
   13052                      mkexpr(rmode),
   13053                      binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
   13054          }
   13055          goto decode_success;
   13056       }
   13057       /* F2 0F 2A = CVTSI2SD
   13058          when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
   13059          when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
   13060       */
   13061       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   13062          modrm = getUChar(delta);
   13063          if (sz == 4) {
   13064             IRTemp arg32 = newTemp(Ity_I32);
   13065             if (epartIsReg(modrm)) {
   13066                assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
   13067                delta += 1;
   13068                DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   13069                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   13070             } else {
   13071                addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13072                assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   13073                delta += alen;
   13074                DIP("cvtsi2sdl %s,%s\n", dis_buf,
   13075                                         nameXMMReg(gregOfRexRM(pfx,modrm)) );
   13076             }
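                     /* No rounding mode is needed here: every I32 is exactly
                        representable as an F64, so Iop_I32StoF64 is an exact
                        conversion, unlike Iop_I64StoF64 in the sz == 8 case
                        below. */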
   13077             putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   13078                               unop(Iop_I32StoF64, mkexpr(arg32))
   13079             );
   13080          } else {
   13081             /* sz == 8 */
   13082             IRTemp arg64 = newTemp(Ity_I64);
   13083             if (epartIsReg(modrm)) {
   13084                assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
   13085                delta += 1;
   13086                DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   13087                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   13088             } else {
   13089                addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13090                assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   13091                delta += alen;
   13092                DIP("cvtsi2sdq %s,%s\n", dis_buf,
   13093                                         nameXMMReg(gregOfRexRM(pfx,modrm)) );
   13094             }
   13095             putXMMRegLane64F(
   13096                gregOfRexRM(pfx,modrm),
   13097                0,
   13098                binop( Iop_I64StoF64,
   13099                       get_sse_roundingmode(),
   13100                       mkexpr(arg64)
   13101                )
   13102             );
   13103          }
   13104          goto decode_success;
   13105       }
   13106       /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   13107          xmm(G) */
   13108       if (have66noF2noF3(pfx) && sz == 2) {
   13109          IRTemp arg64 = newTemp(Ity_I64);
   13110 
   13111          modrm = getUChar(delta);
   13112          if (epartIsReg(modrm)) {
   13113             /* Only switch to MMX mode if the source is a MMX register.
   13114                This is inconsistent with all other instructions which
   13115                convert between XMM and (M64 or MMX), which always switch
   13116                to MMX mode even if the 64-bit operand is M64 and not MMX.  At
   13117                least, that's what the Intel docs seem to me to say.
   13118                Fixes #210264. */
   13119             do_MMX_preamble();
   13120             assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
   13121             delta += 1;
   13122             DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   13123                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13124          } else {
   13125             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13126             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   13127             delta += alen;
   13128             DIP("cvtpi2pd %s,%s\n", dis_buf,
   13129                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
   13130          }
   13131 
   13132          putXMMRegLane64F(
   13133             gregOfRexRM(pfx,modrm), 0,
   13134             unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   13135          );
   13136 
   13137          putXMMRegLane64F(
   13138             gregOfRexRM(pfx,modrm), 1,
   13139             unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   13140          );
   13141 
   13142          goto decode_success;
   13143       }
   13144       break;
   13145 
   13146    case 0x2B:
   13147       /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   13148       /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   13149       if ( (haveNo66noF2noF3(pfx) && sz == 4)
   13150            || (have66noF2noF3(pfx) && sz == 2) ) {
   13151          modrm = getUChar(delta);
   13152          if (!epartIsReg(modrm)) {
   13153             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13154             gen_SEGV_if_not_16_aligned( addr );
   13155             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   13156             DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   13157                                     nameXMMReg(gregOfRexRM(pfx,modrm)),
   13158                                     dis_buf);
   13159             delta += alen;
   13160             goto decode_success;
   13161          }
   13162          /* else fall through */
   13163       }
   13164       break;
   13165 
   13166    case 0x2C:
   13167    case 0x2D:
   13168       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13169          I32 in mmx, according to prevailing SSE rounding mode */
   13170       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13171          I32 in mmx, rounding towards zero */
   13172       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13173          IRTemp dst64  = newTemp(Ity_I64);
   13174          IRTemp rmode  = newTemp(Ity_I32);
   13175          IRTemp f32lo  = newTemp(Ity_F32);
   13176          IRTemp f32hi  = newTemp(Ity_F32);
   13177          Bool   r2zero = toBool(opc == 0x2C);
   13178 
   13179          do_MMX_preamble();
   13180          modrm = getUChar(delta);
   13181 
   13182          if (epartIsReg(modrm)) {
   13183             delta += 1;
   13184             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   13185             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   13186             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13187                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13188                                       nameMMXReg(gregLO3ofRM(modrm)));
   13189          } else {
   13190             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13191             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   13192             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   13193                                                  mkexpr(addr),
   13194                                                  mkU64(4) )));
   13195             delta += alen;
   13196             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13197                                       dis_buf,
   13198                                       nameMMXReg(gregLO3ofRM(modrm)));
   13199          }
   13200 
   13201          if (r2zero) {
   13202             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13203          } else {
   13204             assign( rmode, get_sse_roundingmode() );
   13205          }
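                  /* Widen each F32 to F64 (an exact conversion), then convert
                     to I32 with the chosen rounding mode; Iop_32HLto64 packs
                     the high result above the low one. */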
   13206 
   13207          assign(
   13208             dst64,
   13209             binop( Iop_32HLto64,
   13210                    binop( Iop_F64toI32S,
   13211                           mkexpr(rmode),
   13212                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   13213                    binop( Iop_F64toI32S,
   13214                           mkexpr(rmode),
   13215                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   13216                  )
   13217          );
   13218 
   13219          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13220          goto decode_success;
   13221       }
   13222       /* F3 0F 2D = CVTSS2SI
   13223          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13224                        according to prevailing SSE rounding mode
   13225          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13226                        according to prevailing SSE rounding mode
   13227       */
   13228       /* F3 0F 2C = CVTTSS2SI
   13229          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13230                        truncating towards zero
   13231          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13232                        truncating towards zero
   13233       */
   13234       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   13235          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13236          goto decode_success;
   13237       }
   13238       /* F2 0F 2D = CVTSD2SI
   13239          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13240                        according to prevailing SSE rounding mode
   13241          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13242                        according to prevailing SSE rounding mode
   13243       */
   13244       /* F2 0F 2C = CVTTSD2SI
   13245          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13246                        truncating towards zero
   13247          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13248                        truncating towards zero
   13249       */
   13250       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   13251          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13252          goto decode_success;
   13253       }
   13254       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13255          I32 in mmx, according to prevailing SSE rounding mode */
   13256       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13257          I32 in mmx, rounding towards zero */
   13258       if (have66noF2noF3(pfx) && sz == 2) {
   13259          IRTemp dst64  = newTemp(Ity_I64);
   13260          IRTemp rmode  = newTemp(Ity_I32);
   13261          IRTemp f64lo  = newTemp(Ity_F64);
   13262          IRTemp f64hi  = newTemp(Ity_F64);
   13263          Bool   r2zero = toBool(opc == 0x2C);
   13264 
   13265          do_MMX_preamble();
   13266          modrm = getUChar(delta);
   13267 
   13268          if (epartIsReg(modrm)) {
   13269             delta += 1;
   13270             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   13271             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   13272             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   13273                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13274                                       nameMMXReg(gregLO3ofRM(modrm)));
   13275          } else {
   13276             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13277             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   13278             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   13279                                                  mkexpr(addr),
   13280                                                  mkU64(8) )));
   13281             delta += alen;
   13282             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   13283                                       dis_buf,
   13284                                       nameMMXReg(gregLO3ofRM(modrm)));
   13285          }
   13286 
   13287          if (r2zero) {
   13288             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13289          } else {
   13290             assign( rmode, get_sse_roundingmode() );
   13291          }
   13292 
   13293          assign(
   13294             dst64,
   13295             binop( Iop_32HLto64,
   13296                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   13297                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   13298                  )
   13299          );
   13300 
   13301          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13302          goto decode_success;
   13303       }
   13304       break;
   13305 
   13306    case 0x2E:
   13307    case 0x2F:
   13308       /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   13309       /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   13310       if (have66noF2noF3(pfx) && sz == 2) {
   13311          delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
   13312          goto decode_success;
   13313       }
   13314       /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   13315       /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   13316       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13317          delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
   13318          goto decode_success;
   13319       }
   13320       break;
   13321 
   13322    case 0x50:
   13323       /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   13324          to 4 lowest bits of ireg(G) */
   13325       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   13326           && epartIsReg(getUChar(delta))) {
   13327          /* sz == 8 is a kludge to handle insns with REX.W redundantly
   13328             set to 1, which has been known to happen:
   13329 
   13330             4c 0f 50 d9             rex64X movmskps %xmm1,%r11d
   13331 
   13332             20071106: Intel docs say that REX.W isn't redundant: when
   13333             present, a 64-bit register is written; when not present, only
   13334             the 32-bit half is written.  However, testing on a Core2
   13335             machine suggests the entire 64 bit register is written
   13336             irrespective of the status of REX.W.  That could be because
   13337             of the default rule that says "if the lower half of a 32-bit
   13338             register is written, the upper half is zeroed".  By using
   13339             putIReg32 here we inadvertently produce the same behaviour as
   13340             the Core2, for the same reason -- putIReg32 implements said
   13341             rule.
   13342 
   13343             AMD docs give no indication that REX.W is even valid for this
   13344             insn. */
   13345          delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
   13346          goto decode_success;
   13347       }
   13348       /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   13349          2 lowest bits of ireg(G) */
   13350       if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   13351          /* sz == 8 is a kludge to handle insns with REX.W redundantly
   13352             set to 1, which has been known to happen:
   13353             66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
   13354             20071106: see further comments on MOVMSKPS implementation above.
   13355          */
   13356          delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
   13357          goto decode_success;
   13358       }
   13359       break;
   13360 
   13361    case 0x51:
   13362       /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
   13363       if (haveF3no66noF2(pfx) && sz == 4) {
   13364          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   13365                                             "sqrtss", Iop_Sqrt32F0x4 );
   13366          goto decode_success;
   13367       }
   13368       /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   13369       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13370          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   13371                                            "sqrtps", Iop_Sqrt32Fx4 );
   13372          goto decode_success;
   13373       }
   13374       /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   13375       if (haveF2no66noF3(pfx) && sz == 4) {
   13376          delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
   13377                                             "sqrtsd", Iop_Sqrt64F0x2 );
   13378          goto decode_success;
   13379       }
   13380       /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   13381       if (have66noF2noF3(pfx) && sz == 2) {
   13382          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   13383                                            "sqrtpd", Iop_Sqrt64Fx2 );
   13384          goto decode_success;
   13385       }
   13386       break;
   13387 
   13388    case 0x52:
   13389       /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   13390       if (haveF3no66noF2(pfx) && sz == 4) {
   13391          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   13392                                             "rsqrtss", Iop_RSqrtEst32F0x4 );
   13393          goto decode_success;
   13394       }
   13395       /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   13396       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13397          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   13398                                            "rsqrtps", Iop_RSqrtEst32Fx4 );
   13399          goto decode_success;
   13400       }
   13401       break;
   13402 
   13403    case 0x53:
   13404       /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   13405       if (haveF3no66noF2(pfx) && sz == 4) {
   13406          delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
   13407                                             "rcpss", Iop_RecipEst32F0x4 );
   13408          goto decode_success;
   13409       }
   13410       /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   13411       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13412          delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
   13413                                            "rcpps", Iop_RecipEst32Fx4 );
   13414          goto decode_success;
   13415       }
   13416       break;
   13417 
   13418    case 0x54:
   13419       /* 0F 54 = ANDPS -- G = G and E */
   13420       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13421          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
   13422          goto decode_success;
   13423       }
   13424       /* 66 0F 54 = ANDPD -- G = G and E */
   13425       if (have66noF2noF3(pfx) && sz == 2) {
   13426          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
   13427          goto decode_success;
   13428       }
   13429       break;
   13430 
   13431    case 0x55:
   13432       /* 0F 55 = ANDNPS -- G = (not G) and E */
   13433       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13434          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
   13435                                                            Iop_AndV128 );
   13436          goto decode_success;
   13437       }
   13438       /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   13439       if (have66noF2noF3(pfx) && sz == 2) {
   13440          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
   13441                                                            Iop_AndV128 );
   13442          goto decode_success;
   13443       }
   13444       break;
   13445 
   13446    case 0x56:
  13447       /* 0F 56 = ORPS -- G = G or E */
   13448       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13449          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
   13450          goto decode_success;
   13451       }
  13452       /* 66 0F 56 = ORPD -- G = G or E */
   13453       if (have66noF2noF3(pfx) && sz == 2) {
   13454          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
   13455          goto decode_success;
   13456       }
   13457       break;
   13458 
   13459    case 0x57:
   13460       /* 66 0F 57 = XORPD -- G = G xor E */
   13461       if (have66noF2noF3(pfx) && sz == 2) {
   13462          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
   13463          goto decode_success;
   13464       }
   13465       /* 0F 57 = XORPS -- G = G xor E */
   13466       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13467          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
   13468          goto decode_success;
   13469       }
   13470       break;
   13471 
   13472    case 0x58:
   13473       /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   13474       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13475          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
   13476          goto decode_success;
   13477       }
   13478       /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   13479       if (haveF3no66noF2(pfx) && sz == 4) {
   13480          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
   13481          goto decode_success;
   13482       }
   13483       /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   13484       if (haveF2no66noF3(pfx)
   13485           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13486          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
   13487          goto decode_success;
   13488       }
  13489       /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   13490       if (have66noF2noF3(pfx)
   13491           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13492          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
   13493          goto decode_success;
   13494       }
   13495       break;
   13496 
   13497    case 0x59:
   13498       /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   13499       if (haveF2no66noF3(pfx)
   13500           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13501          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
   13502          goto decode_success;
   13503       }
   13504       /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   13505       if (haveF3no66noF2(pfx) && sz == 4) {
   13506          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
   13507          goto decode_success;
   13508       }
   13509       /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   13510       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13511          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
   13512          goto decode_success;
   13513       }
   13514       /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   13515       if (have66noF2noF3(pfx)
   13516           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13517          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
   13518          goto decode_success;
   13519       }
   13520       break;
   13521 
   13522    case 0x5A:
   13523       /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   13524          F64 in xmm(G). */
   13525       if (haveNo66noF2noF3(pfx)
   13526           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13527          delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
   13528          goto decode_success;
   13529       }
   13530       /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   13531          low half xmm(G) */
   13532       if (haveF3no66noF2(pfx) && sz == 4) {
   13533          IRTemp f32lo = newTemp(Ity_F32);
   13534 
   13535          modrm = getUChar(delta);
   13536          if (epartIsReg(modrm)) {
   13537             delta += 1;
   13538             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   13539             DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13540                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13541          } else {
   13542             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13543             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   13544             delta += alen;
   13545             DIP("cvtss2sd %s,%s\n", dis_buf,
   13546                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13547          }
   13548 
   13549          putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   13550                            unop( Iop_F32toF64, mkexpr(f32lo) ) );
   13551 
   13552          goto decode_success;
   13553       }
   13554       /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   13555          low 1/4 xmm(G), according to prevailing SSE rounding mode */
   13556       if (haveF2no66noF3(pfx) && sz == 4) {
   13557          IRTemp rmode = newTemp(Ity_I32);
   13558          IRTemp f64lo = newTemp(Ity_F64);
   13559 
   13560          modrm = getUChar(delta);
   13561          if (epartIsReg(modrm)) {
   13562             delta += 1;
   13563             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   13564             DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13565                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13566          } else {
   13567             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13568             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   13569             delta += alen;
   13570             DIP("cvtsd2ss %s,%s\n", dis_buf,
   13571                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13572          }
   13573 
   13574          assign( rmode, get_sse_roundingmode() );
   13575          putXMMRegLane32F(
   13576             gregOfRexRM(pfx,modrm), 0,
   13577             binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   13578          );
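                 /* F64->F32 can lose precision, hence the explicit rounding
                    mode above: e.g. under round-to-nearest, 1e300 (F64)
                    overflows to +Inf in F32.  The F32->F64 widening done by
                    CVTSS2SD above is exact, so no mode is needed there. */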
   13579 
   13580          goto decode_success;
   13581       }
   13582       /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   13583          lo half xmm(G), rounding according to prevailing SSE rounding
   13584          mode, and zero upper half */
   13585       /* Note, this is practically identical to CVTPD2DQ.  It would have
  13586          been nice to merge them together. */
   13587       if (have66noF2noF3(pfx) && sz == 2) {
   13588          delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
   13589          goto decode_success;
   13590       }
   13591       break;
   13592 
   13593    case 0x5B:
   13594       /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   13595          xmm(G), rounding towards zero */
   13596       /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   13597          xmm(G), as per the prevailing rounding mode */
   13598       if ( (have66noF2noF3(pfx) && sz == 2)
   13599            || (haveF3no66noF2(pfx) && sz == 4) ) {
  13600          Bool r2zero = toBool(sz == 4); /* given the guard above,
                                                   sz==4 iff F3 prefix,
                                                   i.e. the truncating
                                                   CVTTPS2DQ variant */
   13601          delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
   13602          goto decode_success;
   13603       }
   13604       /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   13605          xmm(G) */
   13606       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13607          delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
   13608          goto decode_success;
   13609       }
   13610       break;
   13611 
   13612    case 0x5C:
   13613       /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   13614       if (haveF3no66noF2(pfx) && sz == 4) {
   13615          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
   13616          goto decode_success;
   13617       }
   13618       /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   13619       if (haveF2no66noF3(pfx)
   13620           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13621          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
   13622          goto decode_success;
   13623       }
   13624       /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   13625       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13626          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
   13627          goto decode_success;
   13628       }
   13629       /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   13630       if (have66noF2noF3(pfx) && sz == 2) {
   13631          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
   13632          goto decode_success;
   13633       }
   13634       break;
   13635 
   13636    case 0x5D:
   13637       /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   13638       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13639          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
   13640          goto decode_success;
   13641       }
   13642       /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   13643       if (haveF3no66noF2(pfx) && sz == 4) {
   13644          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
   13645          goto decode_success;
   13646       }
   13647       /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   13648       if (haveF2no66noF3(pfx)
   13649           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13650          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
   13651          goto decode_success;
   13652       }
   13653       /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   13654       if (have66noF2noF3(pfx) && sz == 2) {
   13655          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
   13656          goto decode_success;
   13657       }
   13658       break;
   13659 
   13660    case 0x5E:
   13661       /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   13662       if (haveF2no66noF3(pfx) && sz == 4) {
   13663          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
   13664          goto decode_success;
   13665       }
   13666       /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   13667       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13668          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
   13669          goto decode_success;
   13670       }
   13671       /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   13672       if (haveF3no66noF2(pfx) && sz == 4) {
   13673          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
   13674          goto decode_success;
   13675       }
   13676       /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   13677       if (have66noF2noF3(pfx) && sz == 2) {
   13678          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
   13679          goto decode_success;
   13680       }
   13681       break;
   13682 
   13683    case 0x5F:
   13684       /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   13685       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13686          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
   13687          goto decode_success;
   13688       }
   13689       /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   13690       if (haveF3no66noF2(pfx) && sz == 4) {
   13691          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
   13692          goto decode_success;
   13693       }
   13694       /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   13695       if (haveF2no66noF3(pfx)
   13696           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13697          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
   13698          goto decode_success;
   13699       }
   13700       /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   13701       if (have66noF2noF3(pfx) && sz == 2) {
   13702          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
   13703          goto decode_success;
   13704       }
   13705       break;
   13706 
   13707    case 0x60:
   13708       /* 66 0F 60 = PUNPCKLBW */
   13709       if (have66noF2noF3(pfx) && sz == 2) {
   13710          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13711                                     "punpcklbw",
   13712                                     Iop_InterleaveLO8x16, True );
   13713          goto decode_success;
   13714       }
   13715       break;
   13716 
   13717    case 0x61:
   13718       /* 66 0F 61 = PUNPCKLWD */
   13719       if (have66noF2noF3(pfx) && sz == 2) {
   13720          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13721                                     "punpcklwd",
   13722                                     Iop_InterleaveLO16x8, True );
   13723          goto decode_success;
   13724       }
   13725       break;
   13726 
   13727    case 0x62:
   13728       /* 66 0F 62 = PUNPCKLDQ */
   13729       if (have66noF2noF3(pfx) && sz == 2) {
   13730          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13731                                     "punpckldq",
   13732                                     Iop_InterleaveLO32x4, True );
   13733          goto decode_success;
   13734       }
   13735       break;
   13736 
   13737    case 0x63:
   13738       /* 66 0F 63 = PACKSSWB */
   13739       if (have66noF2noF3(pfx) && sz == 2) {
   13740          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13741                                     "packsswb",
   13742                                     Iop_QNarrowBin16Sto8Sx16, True );
   13743          goto decode_success;
   13744       }
   13745       break;
   13746 
   13747    case 0x64:
   13748       /* 66 0F 64 = PCMPGTB */
   13749       if (have66noF2noF3(pfx) && sz == 2) {
   13750          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13751                                     "pcmpgtb", Iop_CmpGT8Sx16, False );
   13752          goto decode_success;
   13753       }
   13754       break;
   13755 
   13756    case 0x65:
   13757       /* 66 0F 65 = PCMPGTW */
   13758       if (have66noF2noF3(pfx) && sz == 2) {
   13759          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13760                                     "pcmpgtw", Iop_CmpGT16Sx8, False );
   13761          goto decode_success;
   13762       }
   13763       break;
   13764 
   13765    case 0x66:
   13766       /* 66 0F 66 = PCMPGTD */
   13767       if (have66noF2noF3(pfx) && sz == 2) {
   13768          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13769                                     "pcmpgtd", Iop_CmpGT32Sx4, False );
   13770          goto decode_success;
   13771       }
   13772       break;
   13773 
   13774    case 0x67:
   13775       /* 66 0F 67 = PACKUSWB */
   13776       if (have66noF2noF3(pfx) && sz == 2) {
   13777          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13778                                     "packuswb",
   13779                                     Iop_QNarrowBin16Sto8Ux16, True );
   13780          goto decode_success;
   13781       }
   13782       break;
   13783 
   13784    case 0x68:
   13785       /* 66 0F 68 = PUNPCKHBW */
   13786       if (have66noF2noF3(pfx) && sz == 2) {
   13787          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13788                                     "punpckhbw",
   13789                                     Iop_InterleaveHI8x16, True );
   13790          goto decode_success;
   13791       }
   13792       break;
   13793 
   13794    case 0x69:
   13795       /* 66 0F 69 = PUNPCKHWD */
   13796       if (have66noF2noF3(pfx) && sz == 2) {
   13797          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13798                                     "punpckhwd",
   13799                                     Iop_InterleaveHI16x8, True );
   13800          goto decode_success;
   13801       }
   13802       break;
   13803 
   13804    case 0x6A:
   13805       /* 66 0F 6A = PUNPCKHDQ */
   13806       if (have66noF2noF3(pfx) && sz == 2) {
   13807          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13808                                     "punpckhdq",
   13809                                     Iop_InterleaveHI32x4, True );
   13810          goto decode_success;
   13811       }
   13812       break;
   13813 
   13814    case 0x6B:
   13815       /* 66 0F 6B = PACKSSDW */
   13816       if (have66noF2noF3(pfx) && sz == 2) {
   13817          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13818                                     "packssdw",
   13819                                     Iop_QNarrowBin32Sto16Sx8, True );
   13820          goto decode_success;
   13821       }
   13822       break;
   13823 
   13824    case 0x6C:
   13825       /* 66 0F 6C = PUNPCKLQDQ */
   13826       if (have66noF2noF3(pfx) && sz == 2) {
   13827          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13828                                     "punpcklqdq",
   13829                                     Iop_InterleaveLO64x2, True );
   13830          goto decode_success;
   13831       }
   13832       break;
   13833 
   13834    case 0x6D:
   13835       /* 66 0F 6D = PUNPCKHQDQ */
   13836       if (have66noF2noF3(pfx) && sz == 2) {
   13837          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13838                                     "punpckhqdq",
   13839                                     Iop_InterleaveHI64x2, True );
   13840          goto decode_success;
   13841       }
   13842       break;
   13843 
   13844    case 0x6E:
   13845       /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
   13846                     zeroing high 3/4 of xmm. */
   13847       /*              or from ireg64/m64 to xmm lo 1/2,
   13848                     zeroing high 1/2 of xmm. */
   13849       if (have66noF2noF3(pfx)) {
   13850          vassert(sz == 2 || sz == 8);
   13851          if (sz == 2) sz = 4;
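                 /* The mandatory 66 prefix drags sz down to 2, but MOVD's
                    integer operand is really 32 bits wide (64 with REX.W),
                    hence the bump back to 4. */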
   13852          modrm = getUChar(delta);
   13853          if (epartIsReg(modrm)) {
   13854             delta += 1;
   13855             if (sz == 4) {
   13856                putXMMReg(
   13857                   gregOfRexRM(pfx,modrm),
   13858                   unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
   13859                );
   13860                DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   13861                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13862             } else {
   13863                putXMMReg(
   13864                   gregOfRexRM(pfx,modrm),
   13865                   unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
   13866                );
   13867                DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   13868                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13869             }
   13870          } else {
   13871             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13872             delta += alen;
   13873             putXMMReg(
   13874                gregOfRexRM(pfx,modrm),
   13875                sz == 4
   13876                   ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   13877                   :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
   13878             );
   13879             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
   13880                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13881          }
   13882          goto decode_success;
   13883       }
   13884       break;
   13885 
   13886    case 0x6F:
   13887       if (have66noF2noF3(pfx)
   13888           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13889          /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   13890          modrm = getUChar(delta);
   13891          if (epartIsReg(modrm)) {
   13892             putXMMReg( gregOfRexRM(pfx,modrm),
   13893                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   13894             DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13895                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13896             delta += 1;
   13897          } else {
   13898             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13899             gen_SEGV_if_not_16_aligned( addr );
   13900             putXMMReg( gregOfRexRM(pfx,modrm),
   13901                        loadLE(Ity_V128, mkexpr(addr)) );
   13902             DIP("movdqa %s,%s\n", dis_buf,
   13903                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13904             delta += alen;
   13905          }
   13906          goto decode_success;
   13907       }
   13908       if (haveF3no66noF2(pfx) && sz == 4) {
   13909          /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   13910          modrm = getUChar(delta);
   13911          if (epartIsReg(modrm)) {
   13912             putXMMReg( gregOfRexRM(pfx,modrm),
   13913                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   13914             DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13915                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13916             delta += 1;
   13917          } else {
   13918             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13919             putXMMReg( gregOfRexRM(pfx,modrm),
   13920                        loadLE(Ity_V128, mkexpr(addr)) );
   13921             DIP("movdqu %s,%s\n", dis_buf,
   13922                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13923             delta += alen;
   13924          }
   13925          goto decode_success;
   13926       }
   13927       break;
   13928 
   13929    case 0x70:
   13930       /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   13931       if (have66noF2noF3(pfx) && sz == 2) {
   13932          delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
   13933          goto decode_success;
   13934       }
   13935       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13936       /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   13937       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13938          Int order;
   13939          IRTemp sV, dV, s3, s2, s1, s0;
   13940          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   13941          sV = newTemp(Ity_I64);
   13942          dV = newTemp(Ity_I64);
   13943          do_MMX_preamble();
   13944          modrm = getUChar(delta);
   13945          if (epartIsReg(modrm)) {
   13946             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13947             order = (Int)getUChar(delta+1);
   13948             delta += 1+1;
   13949             DIP("pshufw $%d,%s,%s\n", order,
   13950                                       nameMMXReg(eregLO3ofRM(modrm)),
   13951                                       nameMMXReg(gregLO3ofRM(modrm)));
   13952          } else {
   13953             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   13954                               1/*extra byte after amode*/ );
   13955             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13956             order = (Int)getUChar(delta+alen);
   13957             delta += 1+alen;
   13958             DIP("pshufw $%d,%s,%s\n", order,
   13959                                       dis_buf,
   13960                                       nameMMXReg(gregLO3ofRM(modrm)));
   13961          }
   13962          breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   13963 #        define SEL(n) \
   13964                    ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   13965          assign(dV,
   13966                 mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   13967                              SEL((order>>2)&3), SEL((order>>0)&3) )
   13968          );
   13969          putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
   13970 #        undef SEL
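                 /* Worked example: order == 0x1B is 00 01 10 11 in 2-bit
                    fields, so lanes 3..0 of dV become s0,s1,s2,s3 --
                    pshufw $0x1b reverses the four 16-bit lanes. */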
   13971          goto decode_success;
   13972       }
   13973       /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   13974          mem) to G(xmm), and copy upper half */
   13975       if (haveF2no66noF3(pfx) && sz == 4) {
   13976          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   13977                                   False/*!isAvx*/, False/*!xIsH*/ );
   13978          goto decode_success;
   13979       }
   13980       /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   13981          mem) to G(xmm), and copy lower half */
   13982       if (haveF3no66noF2(pfx) && sz == 4) {
   13983          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   13984                                   False/*!isAvx*/, True/*xIsH*/ );
   13985          goto decode_success;
   13986       }
   13987       break;
   13988 
   13989    case 0x71:
   13990       /* 66 0F 71 /2 ib = PSRLW by immediate */
   13991       if (have66noF2noF3(pfx) && sz == 2
   13992           && epartIsReg(getUChar(delta))
   13993           && gregLO3ofRM(getUChar(delta)) == 2) {
   13994          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
   13995          goto decode_success;
   13996       }
   13997       /* 66 0F 71 /4 ib = PSRAW by immediate */
   13998       if (have66noF2noF3(pfx) && sz == 2
   13999           && epartIsReg(getUChar(delta))
   14000           && gregLO3ofRM(getUChar(delta)) == 4) {
   14001          delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
   14002          goto decode_success;
   14003       }
   14004       /* 66 0F 71 /6 ib = PSLLW by immediate */
   14005       if (have66noF2noF3(pfx) && sz == 2
   14006           && epartIsReg(getUChar(delta))
   14007           && gregLO3ofRM(getUChar(delta)) == 6) {
   14008          delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
   14009          goto decode_success;
   14010       }
   14011       break;
   14012 
   14013    case 0x72:
   14014       /* 66 0F 72 /2 ib = PSRLD by immediate */
   14015       if (have66noF2noF3(pfx) && sz == 2
   14016           && epartIsReg(getUChar(delta))
   14017           && gregLO3ofRM(getUChar(delta)) == 2) {
   14018          delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
   14019          goto decode_success;
   14020       }
   14021       /* 66 0F 72 /4 ib = PSRAD by immediate */
   14022       if (have66noF2noF3(pfx) && sz == 2
   14023           && epartIsReg(getUChar(delta))
   14024           && gregLO3ofRM(getUChar(delta)) == 4) {
   14025          delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
   14026          goto decode_success;
   14027       }
   14028       /* 66 0F 72 /6 ib = PSLLD by immediate */
   14029       if (have66noF2noF3(pfx) && sz == 2
   14030           && epartIsReg(getUChar(delta))
   14031           && gregLO3ofRM(getUChar(delta)) == 6) {
   14032          delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
   14033          goto decode_success;
   14034       }
   14035       break;
   14036 
   14037    case 0x73:
   14038       /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   14039       /* note, if mem case ever filled in, 1 byte after amode */
   14040       if (have66noF2noF3(pfx) && sz == 2
   14041           && epartIsReg(getUChar(delta))
   14042           && gregLO3ofRM(getUChar(delta)) == 3) {
   14043          Int imm = (Int)getUChar(delta+1);
   14044          Int reg = eregOfRexRM(pfx,getUChar(delta));
   14045          DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   14046          delta += 2;
   14047          IRTemp sV = newTemp(Ity_V128);
   14048          assign( sV, getXMMReg(reg) );
   14049          putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
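                 /* Byte-granularity shift: e.g. psrldq $4 moves source byte
                    i+4 to byte i, zero-filling the top 4 bytes; an imm of 16
                    or more architecturally yields all zeroes (assumed to be
                    handled inside math_PSRLDQ).  PSLLDQ below is the mirror
                    image. */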
   14050          goto decode_success;
   14051       }
   14052       /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   14053       /* note, if mem case ever filled in, 1 byte after amode */
   14054       if (have66noF2noF3(pfx) && sz == 2
   14055           && epartIsReg(getUChar(delta))
   14056           && gregLO3ofRM(getUChar(delta)) == 7) {
   14057          Int imm = (Int)getUChar(delta+1);
   14058          Int reg = eregOfRexRM(pfx,getUChar(delta));
   14059          DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   14060          vassert(imm >= 0 && imm <= 255);
   14061          delta += 2;
   14062          IRTemp sV = newTemp(Ity_V128);
   14063          assign( sV, getXMMReg(reg) );
   14064          putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
   14065          goto decode_success;
   14066       }
   14067       /* 66 0F 73 /2 ib = PSRLQ by immediate */
   14068       if (have66noF2noF3(pfx) && sz == 2
   14069           && epartIsReg(getUChar(delta))
   14070           && gregLO3ofRM(getUChar(delta)) == 2) {
   14071          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
   14072          goto decode_success;
   14073       }
   14074       /* 66 0F 73 /6 ib = PSLLQ by immediate */
   14075       if (have66noF2noF3(pfx) && sz == 2
   14076           && epartIsReg(getUChar(delta))
   14077           && gregLO3ofRM(getUChar(delta)) == 6) {
   14078          delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
   14079          goto decode_success;
   14080       }
   14081       break;
   14082 
   14083    case 0x74:
   14084       /* 66 0F 74 = PCMPEQB */
   14085       if (have66noF2noF3(pfx) && sz == 2) {
   14086          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14087                                     "pcmpeqb", Iop_CmpEQ8x16, False );
   14088          goto decode_success;
   14089       }
   14090       break;
   14091 
   14092    case 0x75:
   14093       /* 66 0F 75 = PCMPEQW */
   14094       if (have66noF2noF3(pfx) && sz == 2) {
   14095          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14096                                     "pcmpeqw", Iop_CmpEQ16x8, False );
   14097          goto decode_success;
   14098       }
   14099       break;
   14100 
   14101    case 0x76:
   14102       /* 66 0F 76 = PCMPEQD */
   14103       if (have66noF2noF3(pfx) && sz == 2) {
   14104          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14105                                     "pcmpeqd", Iop_CmpEQ32x4, False );
   14106          goto decode_success;
   14107       }
   14108       break;
   14109 
   14110    case 0x7E:
   14111       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   14112          G (lo half xmm).  Upper half of G is zeroed out. */
   14113       if (haveF3no66noF2(pfx)
   14114           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14115          modrm = getUChar(delta);
   14116          if (epartIsReg(modrm)) {
   14117             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14118                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   14119                /* zero bits 127:64 */
   14120                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
  14121             DIP("movq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14122                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14123             delta += 1;
   14124          } else {
   14125             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14126             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   14127             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14128                              loadLE(Ity_I64, mkexpr(addr)) );
  14129             DIP("movq %s,%s\n", dis_buf,
   14130                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14131             delta += alen;
   14132          }
   14133          goto decode_success;
   14134       }
   14135       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   14136       /*              or from xmm low 1/2 to ireg64 or m64. */
  14137       if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   14138          if (sz == 2) sz = 4;
   14139          modrm = getUChar(delta);
   14140          if (epartIsReg(modrm)) {
   14141             delta += 1;
   14142             if (sz == 4) {
   14143                putIReg32( eregOfRexRM(pfx,modrm),
   14144                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   14145                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14146                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   14147             } else {
   14148                putIReg64( eregOfRexRM(pfx,modrm),
   14149                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   14150                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14151                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   14152             }
   14153          } else {
   14154             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14155             delta += alen;
   14156             storeLE( mkexpr(addr),
   14157                      sz == 4
   14158                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   14159                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   14160             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   14161                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14162          }
   14163          goto decode_success;
   14164       }
   14165       break;
   14166 
   14167    case 0x7F:
   14168       /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   14169       if (haveF3no66noF2(pfx) && sz == 4) {
   14170          modrm = getUChar(delta);
   14171          if (epartIsReg(modrm)) {
   14172             goto decode_failure; /* awaiting test case */
   14173             delta += 1;
   14174             putXMMReg( eregOfRexRM(pfx,modrm),
   14175                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   14176             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14177                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   14178          } else {
   14179             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14180             delta += alen;
   14181             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14182             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14183          }
   14184          goto decode_success;
   14185       }
   14186       /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   14187       if (have66noF2noF3(pfx) && sz == 2) {
   14188          modrm = getUChar(delta);
   14189          if (epartIsReg(modrm)) {
   14190             delta += 1;
   14191             putXMMReg( eregOfRexRM(pfx,modrm),
   14192                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   14193             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14194                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   14195          } else {
   14196             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14197             gen_SEGV_if_not_16_aligned( addr );
   14198             delta += alen;
   14199             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14200             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14201          }
   14202          goto decode_success;
   14203       }
   14204       break;
   14205 
   14206    case 0xAE:
   14207       /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   14208       if (haveNo66noF2noF3(pfx)
   14209           && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   14210           && sz == 4) {
   14211          delta += 1;
   14212          /* Insert a memory fence.  It's sometimes important that these
   14213             are carried through to the generated code. */
   14214          stmt( IRStmt_MBE(Imbe_Fence) );
   14215          DIP("sfence\n");
   14216          goto decode_success;
   14217       }
   14218       /* mindless duplication follows .. */
  14219       /* 0F AE /5 = LFENCE -- serialise loads */
  14220       /* 0F AE /6 = MFENCE -- serialise both loads and stores */
   14221       if (haveNo66noF2noF3(pfx)
   14222           && epartIsReg(getUChar(delta))
   14223           && (gregLO3ofRM(getUChar(delta)) == 5
   14224               || gregLO3ofRM(getUChar(delta)) == 6)
   14225           && sz == 4) {
   14226          delta += 1;
   14227          /* Insert a memory fence.  It's sometimes important that these
   14228             are carried through to the generated code. */
   14229          stmt( IRStmt_MBE(Imbe_Fence) );
   14230          DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
   14231          goto decode_success;
   14232       }
   14233 
   14234       /* 0F AE /7 = CLFLUSH -- flush cache line */
   14235       if (haveNo66noF2noF3(pfx)
   14236           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   14237           && sz == 4) {
   14238 
   14239          /* This is something of a hack.  We need to know the size of
   14240             the cache line containing addr.  Since we don't (easily),
   14241             assume 256 on the basis that no real cache would have a
   14242             line that big.  It's safe to invalidate more stuff than we
   14243             need, just inefficient. */
   14244          ULong lineszB = 256ULL;
   14245 
   14246          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14247          delta += alen;
   14248 
   14249          /* Round addr down to the start of the containing block. */
   14250          stmt( IRStmt_Put(
   14251                   OFFB_CMSTART,
   14252                   binop( Iop_And64,
   14253                          mkexpr(addr),
   14254                          mkU64( ~(lineszB-1) ))) );
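                 /* e.g. addr == 0x100123 with lineszB == 256 gives a mask of
                    ~0xFF, so CMSTART == 0x100100 and the whole 256-byte
                    block containing addr gets invalidated. */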
   14255 
   14256          stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
   14257 
   14258          jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
   14259 
   14260          DIP("clflush %s\n", dis_buf);
   14261          goto decode_success;
   14262       }
   14263 
   14264       /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   14265       if (haveNo66noF2noF3(pfx)
   14266           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
   14267           && sz == 4) {
   14268          delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   14269          goto decode_success;
   14270       }
   14271       /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   14272       if (haveNo66noF2noF3(pfx)
   14273           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
   14274           && sz == 4) {
   14275          delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   14276          goto decode_success;
   14277       }
   14278       /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   14279       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14280           && !epartIsReg(getUChar(delta))
   14281           && gregOfRexRM(pfx,getUChar(delta)) == 0) {
   14282          delta = dis_FXSAVE(vbi, pfx, delta, sz);
   14283          goto decode_success;
   14284       }
   14285       /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   14286       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14287           && !epartIsReg(getUChar(delta))
   14288           && gregOfRexRM(pfx,getUChar(delta)) == 1) {
   14289          delta = dis_FXRSTOR(vbi, pfx, delta, sz);
   14290          goto decode_success;
   14291       }
   14292       /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
   14293       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14294           && !epartIsReg(getUChar(delta))
   14295           && gregOfRexRM(pfx,getUChar(delta)) == 4
   14296           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   14297          delta = dis_XSAVE(vbi, pfx, delta, sz);
   14298          goto decode_success;
   14299       }
   14300       /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
   14301       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14302           && !epartIsReg(getUChar(delta))
   14303           && gregOfRexRM(pfx,getUChar(delta)) == 5
   14304           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   14305          delta = dis_XRSTOR(vbi, pfx, delta, sz);
   14306          goto decode_success;
   14307       }
   14308       break;
   14309 
   14310    case 0xC2:
   14311       /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   14312       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14313          Long delta0 = delta;
   14314          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
   14315          if (delta > delta0) goto decode_success;
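                 /* (dis_SSE_cmp_E_to_G evidently leaves delta unchanged when
                    it cannot decode the comparison, so delta > delta0 signals
                    success; the three variants below use the same pattern.) */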
   14316       }
   14317       /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   14318       if (haveF3no66noF2(pfx) && sz == 4) {
   14319          Long delta0 = delta;
   14320          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
   14321          if (delta > delta0) goto decode_success;
   14322       }
   14323       /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   14324       if (haveF2no66noF3(pfx) && sz == 4) {
   14325          Long delta0 = delta;
   14326          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
   14327          if (delta > delta0) goto decode_success;
   14328       }
   14329       /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   14330       if (have66noF2noF3(pfx) && sz == 2) {
   14331          Long delta0 = delta;
   14332          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
   14333          if (delta > delta0) goto decode_success;
   14334       }
   14335       break;
   14336 
   14337    case 0xC3:
   14338       /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   14339       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   14340          modrm = getUChar(delta);
   14341          if (!epartIsReg(modrm)) {
   14342             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14343             storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
   14344             DIP("movnti %s,%s\n", dis_buf,
   14345                                   nameIRegG(sz, pfx, modrm));
   14346             delta += alen;
   14347             goto decode_success;
   14348          }
   14349          /* else fall through */
   14350       }
   14351       break;
   14352 
   14353    case 0xC4:
   14354       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14355       /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   14356          put it into the specified lane of mmx(G). */
   14357       if (haveNo66noF2noF3(pfx)
   14358           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14359          /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   14360             mmx reg.  t4 is the new lane value.  t5 is the original
   14361             mmx value. t6 is the new mmx value. */
   14362          Int lane;
   14363          t4 = newTemp(Ity_I16);
   14364          t5 = newTemp(Ity_I64);
   14365          t6 = newTemp(Ity_I64);
   14366          modrm = getUChar(delta);
   14367          do_MMX_preamble();
   14368 
   14369          assign(t5, getMMXReg(gregLO3ofRM(modrm)));
   14370          breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   14371 
   14372          if (epartIsReg(modrm)) {
   14373             assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
   14374             delta += 1+1;
   14375             lane = getUChar(delta-1);
   14376             DIP("pinsrw $%d,%s,%s\n", lane,
   14377                                       nameIReg16(eregOfRexRM(pfx,modrm)),
   14378                                       nameMMXReg(gregLO3ofRM(modrm)));
   14379          } else {
   14380             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14381             delta += 1+alen;
   14382             lane = getUChar(delta-1);
   14383             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   14384             DIP("pinsrw $%d,%s,%s\n", lane,
   14385                                       dis_buf,
   14386                                       nameMMXReg(gregLO3ofRM(modrm)));
   14387          }
   14388 
   14389          switch (lane & 3) {
   14390             case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   14391             case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   14392             case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   14393             case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   14394             default: vassert(0);
   14395          }
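                 /* e.g. lane == 2 gives t6 = mk64from16s(t3,t2 -> t4,t1,t0):
                    only bits 47:32 of the mmx register are replaced by the
                    new 16-bit value t4. */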
   14396          putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
   14397          goto decode_success;
   14398       }
   14399       /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   14400          put it into the specified lane of xmm(G). */
   14401       if (have66noF2noF3(pfx)
   14402           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14403          Int lane;
   14404          t4 = newTemp(Ity_I16);
   14405          modrm = getUChar(delta);
   14406          UInt rG = gregOfRexRM(pfx,modrm);
   14407          if (epartIsReg(modrm)) {
   14408             UInt rE = eregOfRexRM(pfx,modrm);
   14409             assign(t4, getIReg16(rE));
   14410             delta += 1+1;
   14411             lane = getUChar(delta-1);
   14412             DIP("pinsrw $%d,%s,%s\n",
   14413                 lane, nameIReg16(rE), nameXMMReg(rG));
   14414          } else {
   14415             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   14416                               1/*byte after the amode*/ );
   14417             delta += 1+alen;
   14418             lane = getUChar(delta-1);
   14419             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   14420             DIP("pinsrw $%d,%s,%s\n",
   14421                 lane, dis_buf, nameXMMReg(rG));
   14422          }
   14423          IRTemp src_vec = newTemp(Ity_V128);
   14424          assign(src_vec, getXMMReg(rG));
   14425          IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
   14426          putXMMReg(rG, mkexpr(res_vec));
   14427          goto decode_success;
   14428       }
   14429       break;
   14430 
   14431    case 0xC5:
   14432       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14433       /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   14434          zero-extend of it in ireg(G). */
   14435       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   14436          modrm = getUChar(delta);
   14437          if (epartIsReg(modrm)) {
   14438             IRTemp sV = newTemp(Ity_I64);
   14439             t5 = newTemp(Ity_I16);
   14440             do_MMX_preamble();
   14441             assign(sV, getMMXReg(eregLO3ofRM(modrm)));
   14442             breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   14443             switch (getUChar(delta+1) & 3) {
   14444                case 0:  assign(t5, mkexpr(t0)); break;
   14445                case 1:  assign(t5, mkexpr(t1)); break;
   14446                case 2:  assign(t5, mkexpr(t2)); break;
   14447                case 3:  assign(t5, mkexpr(t3)); break;
   14448                default: vassert(0);
   14449             }
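                    /* e.g. an immediate of 2 selects t2, i.e. bits 47:32 of
                       the mmx register, which is zero-extended below. */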
   14450             if (sz == 8)
   14451                putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
   14452             else
   14453                putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
   14454             DIP("pextrw $%d,%s,%s\n",
   14455                 (Int)getUChar(delta+1),
   14456                 nameMMXReg(eregLO3ofRM(modrm)),
   14457                 sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
   14458                       : nameIReg32(gregOfRexRM(pfx,modrm))
   14459             );
   14460             delta += 2;
   14461             goto decode_success;
   14462          }
   14463          /* else fall through */
   14464          /* note, for anyone filling in the mem case: this insn has one
   14465             byte after the amode and therefore you must pass 1 as the
   14466             last arg to disAMode */
   14467       }
   14468       /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   14469          zero-extend of it in ireg(G). */
   14470       if (have66noF2noF3(pfx)
   14471           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14472          Long delta0 = delta;
   14473          delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
   14474                                               False/*!isAvx*/ );
   14475          if (delta > delta0) goto decode_success;
   14476          /* else fall through -- decoding has failed */
   14477       }
   14478       break;
   14479 
   14480    case 0xC6:
   14481       /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   14482       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14483          Int    imm8 = 0;
   14484          IRTemp sV   = newTemp(Ity_V128);
   14485          IRTemp dV   = newTemp(Ity_V128);
   14486          modrm = getUChar(delta);
   14487          UInt rG = gregOfRexRM(pfx,modrm);
   14488          assign( dV, getXMMReg(rG) );
   14489          if (epartIsReg(modrm)) {
   14490             UInt rE = eregOfRexRM(pfx,modrm);
   14491             assign( sV, getXMMReg(rE) );
   14492             imm8 = (Int)getUChar(delta+1);
   14493             delta += 1+1;
   14494             DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
   14495          } else {
   14496             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14497             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14498             imm8 = (Int)getUChar(delta+alen);
   14499             delta += 1+alen;
   14500             DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
   14501          }
   14502          IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
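                 /* Assuming the usual SHUFPS semantics in math_SHUFPS_128:
                    result lanes 0 and 1 come from dV (selected by imm8 bits
                    1:0 and 3:2), lanes 2 and 3 from sV (bits 5:4 and 7:6);
                    e.g. shufps $0x1b,%xmm0,%xmm0 reverses %xmm0's lanes. */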
   14503          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14504          goto decode_success;
   14505       }
   14506       /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   14507       if (have66noF2noF3(pfx) && sz == 2) {
   14508          Int    select;
   14509          IRTemp sV = newTemp(Ity_V128);
   14510          IRTemp dV = newTemp(Ity_V128);
   14511 
   14512          modrm = getUChar(delta);
   14513          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   14514 
   14515          if (epartIsReg(modrm)) {
   14516             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   14517             select = (Int)getUChar(delta+1);
   14518             delta += 1+1;
   14519             DIP("shufpd $%d,%s,%s\n", select,
   14520                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   14521                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   14522          } else {
   14523             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14524             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14525             select = getUChar(delta+alen);
   14526             delta += 1+alen;
   14527             DIP("shufpd $%d,%s,%s\n", select,
   14528                                       dis_buf,
   14529                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   14530          }
   14531 
   14532          IRTemp res = math_SHUFPD_128( sV, dV, select );
   14533          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14534          goto decode_success;
   14535       }
   14536       break;
   14537 
   14538    case 0xD1:
   14539       /* 66 0F D1 = PSRLW by E */
   14540       if (have66noF2noF3(pfx) && sz == 2) {
   14541          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
   14542          goto decode_success;
   14543       }
   14544       break;
   14545 
   14546    case 0xD2:
   14547       /* 66 0F D2 = PSRLD by E */
   14548       if (have66noF2noF3(pfx) && sz == 2) {
   14549          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
   14550          goto decode_success;
   14551       }
   14552       break;
   14553 
   14554    case 0xD3:
   14555       /* 66 0F D3 = PSRLQ by E */
   14556       if (have66noF2noF3(pfx) && sz == 2) {
   14557          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
   14558          goto decode_success;
   14559       }
   14560       break;
   14561 
   14562    case 0xD4:
   14563       /* 66 0F D4 = PADDQ */
   14564       if (have66noF2noF3(pfx) && sz == 2) {
   14565          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14566                                     "paddq", Iop_Add64x2, False );
   14567          goto decode_success;
   14568       }
   14569       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14570       /* 0F D4 = PADDQ -- add 64x1 */
   14571       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14572          do_MMX_preamble();
   14573          delta = dis_MMXop_regmem_to_reg (
   14574                    vbi, pfx, delta, opc, "paddq", False );
   14575          goto decode_success;
   14576       }
   14577       break;
   14578 
   14579    case 0xD5:
   14580       /* 66 0F D5 = PMULLW -- 16x8 multiply */
   14581       if (have66noF2noF3(pfx) && sz == 2) {
   14582          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14583                                     "pmullw", Iop_Mul16x8, False );
   14584          goto decode_success;
   14585       }
   14586       break;
   14587 
   14588    case 0xD6:
   14589       /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   14590          hi half). */
   14591       if (haveF3no66noF2(pfx) && sz == 4) {
   14592          modrm = getUChar(delta);
   14593          if (epartIsReg(modrm)) {
   14594             do_MMX_preamble();
   14595             putXMMReg( gregOfRexRM(pfx,modrm),
   14596                        unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
   14597             DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14598                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14599             delta += 1;
   14600             goto decode_success;
   14601          }
   14602          /* apparently no mem case for this insn */
   14603       }
   14604       /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   14605          or lo half xmm).  */
   14606       if (have66noF2noF3(pfx)
   14607           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14608          modrm = getUChar(delta);
   14609          if (epartIsReg(modrm)) {
   14610             /* fall through, awaiting test case */
   14611             /* dst: lo half copied, hi half zeroed */
   14612          } else {
   14613             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14614             storeLE( mkexpr(addr),
   14615                      getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   14616             DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
   14617             delta += alen;
   14618             goto decode_success;
   14619          }
   14620       }
   14621       /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   14622       if (haveF2no66noF3(pfx) && sz == 4) {
   14623          modrm = getUChar(delta);
   14624          if (epartIsReg(modrm)) {
   14625             do_MMX_preamble();
   14626             putMMXReg( gregLO3ofRM(modrm),
   14627                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   14628             DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14629                                    nameMMXReg(gregLO3ofRM(modrm)));
   14630             delta += 1;
   14631             goto decode_success;
   14632          }
   14633          /* apparently no mem case for this insn */
   14634       }
   14635       break;
   14636 
   14637    case 0xD7:
   14638       /* 66 0F D7 = PMOVMSKB -- extract the sign bit of each of the
   14639          16 byte lanes in xmm(E), collect them into a 16-bit value,
   14640          and put the zero-extend of it in ireg(G).  Doing this
   14641          directly is just too cumbersome, so call a helper instead. */
   14642       if (have66noF2noF3(pfx)
   14643           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
   14644           && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
   14645          delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
   14646          goto decode_success;
   14647       }
   14648       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14649       /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   14650          mmx(E), turn them into a byte, and put zero-extend of it in
   14651          ireg(G). */
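               /* Scalar equivalent, for clarity (hypothetical code, not part
                  of the decoder):
                     UInt r = 0;
                     for (i = 0; i < 8; i++)
                        r |= ((src >> (8*i + 7)) & 1) << i;
                  Iop_GetMSBs8x8 gathers the byte MSBs in this way. */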
   14652       if (haveNo66noF2noF3(pfx)
   14653           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14654          modrm = getUChar(delta);
   14655          if (epartIsReg(modrm)) {
   14656             do_MMX_preamble();
   14657             t0 = newTemp(Ity_I64);
   14658             t1 = newTemp(Ity_I32);
   14659             assign(t0, getMMXReg(eregLO3ofRM(modrm)));
   14660             assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   14661             putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
   14662             DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14663                                     nameIReg32(gregOfRexRM(pfx,modrm)));
   14664             delta += 1;
   14665             goto decode_success;
   14666          }
   14667          /* else fall through */
   14668       }
   14669       break;
   14670 
   14671    case 0xD8:
   14672       /* 66 0F D8 = PSUBUSB */
   14673       if (have66noF2noF3(pfx) && sz == 2) {
   14674          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14675                                     "psubusb", Iop_QSub8Ux16, False );
   14676          goto decode_success;
   14677       }
   14678       break;
   14679 
   14680    case 0xD9:
   14681       /* 66 0F D9 = PSUBUSW */
   14682       if (have66noF2noF3(pfx) && sz == 2) {
   14683          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14684                                     "psubusw", Iop_QSub16Ux8, False );
   14685          goto decode_success;
   14686       }
   14687       break;
   14688 
   14689    case 0xDA:
   14690       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14691       /* 0F DA = PMINUB -- 8x8 unsigned min */
   14692       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14693          do_MMX_preamble();
   14694          delta = dis_MMXop_regmem_to_reg (
   14695                     vbi, pfx, delta, opc, "pminub", False );
   14696          goto decode_success;
   14697       }
   14698       /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   14699       if (have66noF2noF3(pfx) && sz == 2) {
   14700          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14701                                     "pminub", Iop_Min8Ux16, False );
   14702          goto decode_success;
   14703       }
   14704       break;
   14705 
   14706    case 0xDB:
   14707       /* 66 0F DB = PAND */
   14708       if (have66noF2noF3(pfx) && sz == 2) {
   14709          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
   14710          goto decode_success;
   14711       }
   14712       break;
   14713 
   14714    case 0xDC:
   14715       /* 66 0F DC = PADDUSB */
   14716       if (have66noF2noF3(pfx) && sz == 2) {
   14717          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14718                                     "paddusb", Iop_QAdd8Ux16, False );
   14719          goto decode_success;
   14720       }
   14721       break;
   14722 
   14723    case 0xDD:
   14724       /* 66 0F DD = PADDUSW */
   14725       if (have66noF2noF3(pfx) && sz == 2) {
   14726          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14727                                     "paddusw", Iop_QAdd16Ux8, False );
   14728          goto decode_success;
   14729       }
   14730       break;
   14731 
   14732    case 0xDE:
   14733       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14734       /* 0F DE = PMAXUB -- 8x8 unsigned max */
   14735       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14736          do_MMX_preamble();
   14737          delta = dis_MMXop_regmem_to_reg (
   14738                     vbi, pfx, delta, opc, "pmaxub", False );
   14739          goto decode_success;
   14740       }
   14741       /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   14742       if (have66noF2noF3(pfx) && sz == 2) {
   14743          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14744                                     "pmaxub", Iop_Max8Ux16, False );
   14745          goto decode_success;
   14746       }
   14747       break;
   14748 
   14749    case 0xDF:
   14750       /* 66 0F DF = PANDN */
   14751       if (have66noF2noF3(pfx) && sz == 2) {
   14752          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   14753          goto decode_success;
   14754       }
   14755       break;
   14756 
   14757    case 0xE0:
   14758       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14759       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   14760       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14761          do_MMX_preamble();
   14762          delta = dis_MMXop_regmem_to_reg (
   14763                     vbi, pfx, delta, opc, "pavgb", False );
   14764          goto decode_success;
   14765       }
   14766       /* 66 0F E0 = PAVGB */
   14767       if (have66noF2noF3(pfx) && sz == 2) {
   14768          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14769                                     "pavgb", Iop_Avg8Ux16, False );
   14770          goto decode_success;
   14771       }
   14772       break;
   14773 
   14774    case 0xE1:
   14775       /* 66 0F E1 = PSRAW by E */
   14776       if (have66noF2noF3(pfx) && sz == 2) {
   14777          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   14778          goto decode_success;
   14779       }
   14780       break;
   14781 
   14782    case 0xE2:
   14783       /* 66 0F E2 = PSRAD by E */
   14784       if (have66noF2noF3(pfx) && sz == 2) {
   14785          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   14786          goto decode_success;
   14787       }
   14788       break;
   14789 
   14790    case 0xE3:
   14791       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14792       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   14793       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14794          do_MMX_preamble();
   14795          delta = dis_MMXop_regmem_to_reg (
   14796                     vbi, pfx, delta, opc, "pavgw", False );
   14797          goto decode_success;
   14798       }
   14799       /* 66 0F E3 = PAVGW */
   14800       if (have66noF2noF3(pfx) && sz == 2) {
   14801          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14802                                     "pavgw", Iop_Avg16Ux8, False );
   14803          goto decode_success;
   14804       }
   14805       break;
   14806 
   14807    case 0xE4:
   14808       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14809       /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   14810       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14811          do_MMX_preamble();
   14812          delta = dis_MMXop_regmem_to_reg (
   14813                     vbi, pfx, delta, opc, "pmuluh", False );
   14814          goto decode_success;
   14815       }
   14816       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   14817       if (have66noF2noF3(pfx) && sz == 2) {
   14818          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14819                                     "pmulhuw", Iop_MulHi16Ux8, False );
   14820          goto decode_success;
   14821       }
   14822       break;
   14823 
   14824    case 0xE5:
   14825       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   14826       if (have66noF2noF3(pfx) && sz == 2) {
   14827          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14828                                     "pmulhw", Iop_MulHi16Sx8, False );
   14829          goto decode_success;
   14830       }
   14831       break;
   14832 
   14833    case 0xE6:
   14834       /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14835          lo half xmm(G), and zero upper half, rounding towards zero */
   14836       /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14837          lo half xmm(G), according to prevailing rounding mode, and zero
   14838          upper half */
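               /* Both forms share one helper: the 66 form (sz == 2) truncates
                  (rounds towards zero) while the F2 form honours the current
                  rounding mode -- hence the r2zero flag below. */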
   14839       if ( (haveF2no66noF3(pfx) && sz == 4)
   14840            || (have66noF2noF3(pfx) && sz == 2) ) {
   14841          delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
   14842                                     toBool(sz == 2)/*r2zero*/);
   14843          goto decode_success;
   14844       }
   14845       /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   14846          F64 in xmm(G) */
   14847       if (haveF3no66noF2(pfx) && sz == 4) {
   14848          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
   14849          goto decode_success;
   14850       }
   14851       break;
   14852 
   14853    case 0xE7:
   14854       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14855       /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   14856          Intel manual does not say anything about the usual business of
   14857          the FP reg tags getting trashed whenever an MMX insn happens.
   14858          So we just leave them alone.
   14859       */
   14860       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14861          modrm = getUChar(delta);
   14862          if (!epartIsReg(modrm)) {
   14863             /* do_MMX_preamble(); Intel docs don't specify this */
   14864             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14865             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   14866             DIP("movntq %s,%s\n", dis_buf,
   14867                                   nameMMXReg(gregLO3ofRM(modrm)));
   14868             delta += alen;
   14869             goto decode_success;
   14870          }
   14871          /* else fall through */
   14872       }
   14873       /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   14874       if (have66noF2noF3(pfx) && sz == 2) {
   14875          modrm = getUChar(delta);
   14876          if (!epartIsReg(modrm)) {
   14877             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14878             gen_SEGV_if_not_16_aligned( addr );
   14879             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14880             DIP("movntdq %s,%s\n", dis_buf,
   14881                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14882             delta += alen;
   14883             goto decode_success;
   14884          }
   14885          /* else fall through */
   14886       }
   14887       break;
   14888 
   14889    case 0xE8:
   14890       /* 66 0F E8 = PSUBSB */
   14891       if (have66noF2noF3(pfx) && sz == 2) {
   14892          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14893                                     "psubsb", Iop_QSub8Sx16, False );
   14894          goto decode_success;
   14895       }
   14896       break;
   14897 
   14898    case 0xE9:
   14899       /* 66 0F E9 = PSUBSW */
   14900       if (have66noF2noF3(pfx) && sz == 2) {
   14901          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14902                                     "psubsw", Iop_QSub16Sx8, False );
   14903          goto decode_success;
   14904       }
   14905       break;
   14906 
   14907    case 0xEA:
   14908       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14909       /* 0F EA = PMINSW -- 16x4 signed min */
   14910       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14911          do_MMX_preamble();
   14912          delta = dis_MMXop_regmem_to_reg (
   14913                     vbi, pfx, delta, opc, "pminsw", False );
   14914          goto decode_success;
   14915       }
   14916       /* 66 0F EA = PMINSW -- 16x8 signed min */
   14917       if (have66noF2noF3(pfx) && sz == 2) {
   14918          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14919                                     "pminsw", Iop_Min16Sx8, False );
   14920          goto decode_success;
   14921       }
   14922       break;
   14923 
   14924    case 0xEB:
   14925       /* 66 0F EB = POR */
   14926       if (have66noF2noF3(pfx) && sz == 2) {
   14927          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   14928          goto decode_success;
   14929       }
   14930       break;
   14931 
   14932    case 0xEC:
   14933       /* 66 0F EC = PADDSB */
   14934       if (have66noF2noF3(pfx) && sz == 2) {
   14935          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14936                                     "paddsb", Iop_QAdd8Sx16, False );
   14937          goto decode_success;
   14938       }
   14939       break;
   14940 
   14941    case 0xED:
   14942       /* 66 0F ED = PADDSW */
   14943       if (have66noF2noF3(pfx) && sz == 2) {
   14944          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14945                                     "paddsw", Iop_QAdd16Sx8, False );
   14946          goto decode_success;
   14947       }
   14948       break;
   14949 
   14950    case 0xEE:
   14951       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14952       /* 0F EE = PMAXSW -- 16x4 signed max */
   14953       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14954          do_MMX_preamble();
   14955          delta = dis_MMXop_regmem_to_reg (
   14956                     vbi, pfx, delta, opc, "pmaxsw", False );
   14957          goto decode_success;
   14958       }
   14959       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   14960       if (have66noF2noF3(pfx) && sz == 2) {
   14961          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14962                                     "pmaxsw", Iop_Max16Sx8, False );
   14963          goto decode_success;
   14964       }
   14965       break;
   14966 
   14967    case 0xEF:
   14968       /* 66 0F EF = PXOR */
   14969       if (have66noF2noF3(pfx) && sz == 2) {
   14970          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   14971          goto decode_success;
   14972       }
   14973       break;
   14974 
   14975    case 0xF1:
   14976       /* 66 0F F1 = PSLLW by E */
   14977       if (have66noF2noF3(pfx) && sz == 2) {
   14978          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   14979          goto decode_success;
   14980       }
   14981       break;
   14982 
   14983    case 0xF2:
   14984       /* 66 0F F2 = PSLLD by E */
   14985       if (have66noF2noF3(pfx) && sz == 2) {
   14986          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   14987          goto decode_success;
   14988       }
   14989       break;
   14990 
   14991    case 0xF3:
   14992       /* 66 0F F3 = PSLLQ by E */
   14993       if (have66noF2noF3(pfx) && sz == 2) {
   14994          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   14995          goto decode_success;
   14996       }
   14997       break;
   14998 
   14999    case 0xF4:
   15000       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply: 32-bit lanes
   15001          0 of the two operands form the lower 64-bit half of the result,
   15002          and lanes 2 form the upper 64-bit half */
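               /* In scalar terms (illustration only):
                     res[63:0]   = (ULong)(UInt)dV[31:0]  * (ULong)(UInt)sV[31:0];
                     res[127:64] = (ULong)(UInt)dV[95:64] * (ULong)(UInt)sV[95:64];
               */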
   15003       if (have66noF2noF3(pfx) && sz == 2) {
   15004          IRTemp sV = newTemp(Ity_V128);
   15005          IRTemp dV = newTemp(Ity_V128);
   15006          modrm = getUChar(delta);
   15007          UInt rG = gregOfRexRM(pfx,modrm);
   15008          assign( dV, getXMMReg(rG) );
   15009          if (epartIsReg(modrm)) {
   15010             UInt rE = eregOfRexRM(pfx,modrm);
   15011             assign( sV, getXMMReg(rE) );
   15012             delta += 1;
   15013             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15014          } else {
   15015             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15016             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15017             delta += alen;
   15018             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   15019          }
   15020          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   15021          goto decode_success;
   15022       }
   15023       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15024       /* 0F F4 = PMULUDQ -- unsigned widening multiply of the low
   15025          32 bits of each operand, forming a 64-bit result */
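               /* i.e. res = (ULong)(UInt)dV * (ULong)(UInt)sV -- only the low
                  32 bits of each 64-bit MMX register take part. */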
   15026       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15027          IRTemp sV = newTemp(Ity_I64);
   15028          IRTemp dV = newTemp(Ity_I64);
   15029          t1 = newTemp(Ity_I32);
   15030          t0 = newTemp(Ity_I32);
   15031          modrm = getUChar(delta);
   15032 
   15033          do_MMX_preamble();
   15034          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15035 
   15036          if (epartIsReg(modrm)) {
   15037             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15038             delta += 1;
   15039             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15040                                    nameMMXReg(gregLO3ofRM(modrm)));
   15041          } else {
   15042             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15043             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15044             delta += alen;
   15045             DIP("pmuludq %s,%s\n", dis_buf,
   15046                                    nameMMXReg(gregLO3ofRM(modrm)));
   15047          }
   15048 
   15049          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   15050          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   15051          putMMXReg( gregLO3ofRM(modrm),
   15052                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   15053          goto decode_success;
   15054       }
   15055       break;
   15056 
   15057    case 0xF5:
   15058       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   15059          E(xmm or mem) to G(xmm) */
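               /* Per 32-bit result lane i (illustration only, with d16/s16
                  denoting the 16-bit lanes of the two operands):
                     res[i] = (Int)d16[2*i]   * (Int)s16[2*i]
                            + (Int)d16[2*i+1] * (Int)s16[2*i+1];
                  i.e. 16x16 -> 32 signed widening multiplies, then added. */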
   15060       if (have66noF2noF3(pfx) && sz == 2) {
   15061          IRTemp sV = newTemp(Ity_V128);
   15062          IRTemp dV = newTemp(Ity_V128);
   15063          modrm     = getUChar(delta);
   15064          UInt   rG = gregOfRexRM(pfx,modrm);
   15065          if (epartIsReg(modrm)) {
   15066             UInt rE = eregOfRexRM(pfx,modrm);
   15067             assign( sV, getXMMReg(rE) );
   15068             delta += 1;
   15069             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15070          } else {
   15071             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15072             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15073             delta += alen;
   15074             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   15075          }
   15076          assign( dV, getXMMReg(rG) );
   15077          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   15078          goto decode_success;
   15079       }
   15080       break;
   15081 
   15082    case 0xF6:
   15083       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   15084       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   15085       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15086          do_MMX_preamble();
   15087          delta = dis_MMXop_regmem_to_reg (
   15088                     vbi, pfx, delta, opc, "psadbw", False );
   15089          goto decode_success;
   15090       }
   15091       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   15092          from E(xmm or mem) to G(xmm) */
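               /* Per 64-bit half (illustration only): sum the absolute byte
                  differences, then zero-extend the 16-bit total:
                     UInt sum = 0;
                     for (i = 0; i < 8; i++)
                        sum += abs((Int)d8[i] - (Int)s8[i]);
                     res = (ULong)(UShort)sum;
               */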
   15093       if (have66noF2noF3(pfx) && sz == 2) {
   15094          IRTemp sV  = newTemp(Ity_V128);
   15095          IRTemp dV  = newTemp(Ity_V128);
   15096          modrm = getUChar(delta);
   15097          UInt   rG   = gregOfRexRM(pfx,modrm);
   15098          if (epartIsReg(modrm)) {
   15099             UInt rE = eregOfRexRM(pfx,modrm);
   15100             assign( sV, getXMMReg(rE) );
   15101             delta += 1;
   15102             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15103          } else {
   15104             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15105             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15106             delta += alen;
   15107             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   15108          }
   15109          assign( dV, getXMMReg(rG) );
   15110          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   15111 
   15112          goto decode_success;
   15113       }
   15114       break;
   15115 
   15116    case 0xF7:
   15117       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   15118       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   15119       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15120          Bool ok = False;
   15121          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   15122          if (ok) goto decode_success;
   15123       }
   15124       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
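               /* Semantics: stores only those bytes of xmm(G) whose
                  corresponding byte in xmm(E) has its MSB set, to the address
                  in RDI; no alignment requirement applies. */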
   15125       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   15126          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   15127          goto decode_success;
   15128       }
   15129       break;
   15130 
   15131    case 0xF8:
   15132       /* 66 0F F8 = PSUBB */
   15133       if (have66noF2noF3(pfx) && sz == 2) {
   15134          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15135                                     "psubb", Iop_Sub8x16, False );
   15136          goto decode_success;
   15137       }
   15138       break;
   15139 
   15140    case 0xF9:
   15141       /* 66 0F F9 = PSUBW */
   15142       if (have66noF2noF3(pfx) && sz == 2) {
   15143          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15144                                     "psubw", Iop_Sub16x8, False );
   15145          goto decode_success;
   15146       }
   15147       break;
   15148 
   15149    case 0xFA:
   15150       /* 66 0F FA = PSUBD */
   15151       if (have66noF2noF3(pfx) && sz == 2) {
   15152          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15153                                     "psubd", Iop_Sub32x4, False );
   15154          goto decode_success;
   15155       }
   15156       break;
   15157 
   15158    case 0xFB:
   15159       /* 66 0F FB = PSUBQ */
   15160       if (have66noF2noF3(pfx) && sz == 2) {
   15161          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15162                                     "psubq", Iop_Sub64x2, False );
   15163          goto decode_success;
   15164       }
   15165       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15166       /* 0F FB = PSUBQ -- sub 64x1 */
   15167       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15168          do_MMX_preamble();
   15169          delta = dis_MMXop_regmem_to_reg (
   15170                    vbi, pfx, delta, opc, "psubq", False );
   15171          goto decode_success;
   15172       }
   15173       break;
   15174 
   15175    case 0xFC:
   15176       /* 66 0F FC = PADDB */
   15177       if (have66noF2noF3(pfx) && sz == 2) {
   15178          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15179                                     "paddb", Iop_Add8x16, False );
   15180          goto decode_success;
   15181       }
   15182       break;
   15183 
   15184    case 0xFD:
   15185       /* 66 0F FD = PADDW */
   15186       if (have66noF2noF3(pfx) && sz == 2) {
   15187          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15188                                     "paddw", Iop_Add16x8, False );
   15189          goto decode_success;
   15190       }
   15191       break;
   15192 
   15193    case 0xFE:
   15194       /* 66 0F FE = PADDD */
   15195       if (have66noF2noF3(pfx) && sz == 2) {
   15196          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15197                                     "paddd", Iop_Add32x4, False );
   15198          goto decode_success;
   15199       }
   15200       break;
   15201 
   15202    default:
   15203       goto decode_failure;
   15204 
   15205    }
   15206 
   15207   decode_failure:
   15208    *decode_OK = False;
   15209    return deltaIN;
   15210 
   15211   decode_success:
   15212    *decode_OK = True;
   15213    return delta;
   15214 }
   15215 
   15216 
   15217 /*------------------------------------------------------------*/
   15218 /*---                                                      ---*/
   15219 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   15220 /*---                                                      ---*/
   15221 /*------------------------------------------------------------*/
   15222 
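         /* MOVDDUP (128-bit form): duplicate the low F64 of the source into
            both 64-bit lanes of the destination, i.e. res = { d0, d0 }.  The
            memory form loads just 8 bytes. */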
   15223 static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15224                               Long delta, Bool isAvx )
   15225 {
   15226    IRTemp addr   = IRTemp_INVALID;
   15227    Int    alen   = 0;
   15228    HChar  dis_buf[50];
   15229    IRTemp sV    = newTemp(Ity_V128);
   15230    IRTemp d0    = newTemp(Ity_I64);
   15231    UChar  modrm = getUChar(delta);
   15232    UInt   rG    = gregOfRexRM(pfx,modrm);
   15233    if (epartIsReg(modrm)) {
   15234       UInt rE = eregOfRexRM(pfx,modrm);
   15235       assign( sV, getXMMReg(rE) );
   15236       DIP("%smovddup %s,%s\n",
   15237           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   15238       delta += 1;
   15239       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   15240    } else {
   15241       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15242       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15243       DIP("%smovddup %s,%s\n",
   15244           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   15245       delta += alen;
   15246    }
   15247    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15248       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   15249    return delta;
   15250 }
   15251 
   15252 
   15253 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15254                               Long delta )
   15255 {
   15256    IRTemp addr   = IRTemp_INVALID;
   15257    Int    alen   = 0;
   15258    HChar  dis_buf[50];
   15259    IRTemp d0    = newTemp(Ity_I64);
   15260    IRTemp d1    = newTemp(Ity_I64);
   15261    UChar  modrm = getUChar(delta);
   15262    UInt   rG    = gregOfRexRM(pfx,modrm);
   15263    if (epartIsReg(modrm)) {
   15264       UInt rE = eregOfRexRM(pfx,modrm);
   15265       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   15266       delta += 1;
   15267       assign ( d0, getYMMRegLane64(rE, 0) );
   15268       assign ( d1, getYMMRegLane64(rE, 2) );
   15269    } else {
   15270       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15271       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15272       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   15273                                         mkexpr(addr), mkU64(16))) );
   15274       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   15275       delta += alen;
   15276    }
   15277    putYMMRegLane64( rG, 0, mkexpr(d0) );
   15278    putYMMRegLane64( rG, 1, mkexpr(d0) );
   15279    putYMMRegLane64( rG, 2, mkexpr(d1) );
   15280    putYMMRegLane64( rG, 3, mkexpr(d1) );
   15281    return delta;
   15282 }
   15283 
   15284 
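         /* MOVSLDUP/MOVSHDUP: duplicate the even (isL) or odd (!isL) F32
            lanes of the source; e.g. the isL form yields
            res = { s2, s2, s0, s0 }. */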
   15285 static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15286                                Long delta, Bool isAvx, Bool isL )
   15287 {
   15288    IRTemp addr  = IRTemp_INVALID;
   15289    Int    alen  = 0;
   15290    HChar  dis_buf[50];
   15291    IRTemp sV    = newTemp(Ity_V128);
   15292    UChar  modrm = getUChar(delta);
   15293    UInt   rG    = gregOfRexRM(pfx,modrm);
   15294    IRTemp s3, s2, s1, s0;
   15295    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15296    if (epartIsReg(modrm)) {
   15297       UInt rE = eregOfRexRM(pfx,modrm);
   15298       assign( sV, getXMMReg(rE) );
   15299       DIP("%smovs%cdup %s,%s\n",
   15300           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   15301       delta += 1;
   15302    } else {
   15303       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15304       if (!isAvx)
   15305          gen_SEGV_if_not_16_aligned( addr );
   15306       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15307       DIP("%smovs%cdup %s,%s\n",
   15308           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   15309       delta += alen;
   15310    }
   15311    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15312    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15313       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   15314                 : mkV128from32s( s3, s3, s1, s1 ) );
   15315    return delta;
   15316 }
   15317 
   15318 
   15319 static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15320                                Long delta, Bool isL )
   15321 {
   15322    IRTemp addr  = IRTemp_INVALID;
   15323    Int    alen  = 0;
   15324    HChar  dis_buf[50];
   15325    IRTemp sV    = newTemp(Ity_V256);
   15326    UChar  modrm = getUChar(delta);
   15327    UInt   rG    = gregOfRexRM(pfx,modrm);
   15328    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   15329    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15330    if (epartIsReg(modrm)) {
   15331       UInt rE = eregOfRexRM(pfx,modrm);
   15332       assign( sV, getYMMReg(rE) );
   15333       DIP("vmovs%cdup %s,%s\n",
   15334           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   15335       delta += 1;
   15336    } else {
   15337       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15338       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15339       DIP("vmovs%cdup %s,%s\n",
   15340           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   15341       delta += alen;
   15342    }
   15343    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   15344    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   15345                                 : mkV128from32s( s7, s7, s5, s5 ) );
   15346    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   15347                                 : mkV128from32s( s3, s3, s1, s1 ) );
   15348    return delta;
   15349 }
   15350 
   15351 
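         /* HADDPS/HSUBPS helper.  The lanes are regrouped so that a single
            vector op computes all four horizontal results at once; in scalar
            terms (illustration only):
               res = { s2 op s3, s0 op s1, d2 op d3, d0 op d1 }
            where op is + or - and the left operand of each pair comes from
            leftV. */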
   15352 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15353 {
   15354    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15355    IRTemp leftV  = newTemp(Ity_V128);
   15356    IRTemp rightV = newTemp(Ity_V128);
   15357    IRTemp rm     = newTemp(Ity_I32);
   15358    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15359 
   15360    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15361    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   15362 
   15363    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   15364    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   15365 
   15366    IRTemp res = newTemp(Ity_V128);
   15367    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15368    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   15369                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15370    return res;
   15371 }
   15372 
   15373 
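         /* Same regrouping idea for F64 pairs (illustration only):
               res = { s0 op s1, d0 op d1 }. */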
   15374 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15375 {
   15376    IRTemp s1, s0, d1, d0;
   15377    IRTemp leftV  = newTemp(Ity_V128);
   15378    IRTemp rightV = newTemp(Ity_V128);
   15379    IRTemp rm     = newTemp(Ity_I32);
   15380    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   15381 
   15382    breakupV128to64s( sV, &s1, &s0 );
   15383    breakupV128to64s( dV, &d1, &d0 );
   15384 
   15385    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   15386    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   15387 
   15388    IRTemp res = newTemp(Ity_V128);
   15389    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15390    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   15391                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15392    return res;
   15393 }
   15394 
   15395 
   15396 __attribute__((noinline))
   15397 static
   15398 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   15399                         const VexAbiInfo* vbi,
   15400                         Prefix pfx, Int sz, Long deltaIN )
   15401 {
   15402    IRTemp addr  = IRTemp_INVALID;
   15403    UChar  modrm = 0;
   15404    Int    alen  = 0;
   15405    HChar  dis_buf[50];
   15406 
   15407    *decode_OK = False;
   15408 
   15409    Long   delta = deltaIN;
   15410    UChar  opc   = getUChar(delta);
   15411    delta++;
   15412    switch (opc) {
   15413 
   15414    case 0x12:
   15415       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   15416          duplicating some lanes (2:2:0:0). */
   15417       if (haveF3no66noF2(pfx) && sz == 4) {
   15418          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15419                                    True/*isL*/ );
   15420          goto decode_success;
   15421       }
   15422       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   15423