Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2017 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * some of the FCOM cases could do with testing -- not convinced
     58      that the args are the right way round.
     59 
     60    * FSAVE does not re-initialise the FPU; it should do
     61 
     62    * FINIT not only initialises the FPU environment, it also
     63      zeroes all the FP registers.  It should leave the registers
     64      unchanged.
     65 
     66    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     67    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     68    only way to observe eflags[1], a proper fix would be to make that
     69    bit be set by PUSHF.
     70 
     71    The state of %eflags.AC (alignment check, bit 18) is recorded by
     72    the simulation (viz, if you set it with popf then a pushf produces
     73    the value you set it to), but it is otherwise ignored.  In
     74    particular, setting it to 1 does NOT cause alignment checking to
     75    happen.  Programs that set it to 1 and then rely on the resulting
     76    SIGBUSs to inform them of misaligned accesses will not work.
     77 
     78    Implementation of sysenter is necessarily partial.  sysenter is a
     79    kind of system call entry.  When doing a sysenter, the return
     80    address is not known -- that is something that is beyond Vex's
     81    knowledge.  So the generated IR forces a return to the scheduler,
     82    which can do what it likes to simulate the sysenter, but it MUST
     83    set this thread's guest_EIP field with the continuation address
     84    before resuming execution.  If that doesn't happen, the thread will
     85    jump to address zero, which is probably fatal.
     86 
     87    This module uses global variables and so is not MT-safe (if that
     88    should ever become relevant).
     89 
     90    The delta values are 32-bit ints, not 64-bit ints.  That means
     91    this module may not work right if run on a 64-bit host.  That should
     92    be fixed properly, really -- if anyone ever wants to use Vex to
     93    translate x86 code for execution on a 64-bit host.
     94 
     95    casLE (implementation of lock-prefixed insns) and rep-prefixed
     96    insns: the side-exit back to the start of the insn is done with
     97    Ijk_Boring.  This is quite wrong, it should be done with
     98    Ijk_NoRedir, since otherwise the side exit, which is intended to
     99    restart the instruction for whatever reason, could go somewhere
    100    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    101    no-redir jumps performance critical, at least for rep-prefixed
    102    instructions, since all iterations thereof would involve such a
    103    jump.  It's not such a big deal with casLE since the side exit is
    104    only taken if the CAS fails, that is, the location is contended,
    105    which is relatively unlikely.
    106 
    107    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    108    problem.
    109 
    110    Note also, the test for CAS success vs failure is done using
    111    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    112    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    113    shouldn't definedness-check these comparisons.  See
    114    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    115    background/rationale.
    116 */
    117 
    118 /* Performance holes:
    119 
    120    - fcom ; fstsw %ax ; sahf
    121      sahf does not update the O flag (sigh) and so O needs to
    122      be computed.  This is done expensively; it would be better
    123      to have a calculate_eflags_o helper.
    124 
    125    - emwarns; some FP codes can generate huge numbers of these
    126      if the fpucw is changed in an inner loop.  It would be
    127      better for the guest state to have an emwarn-enable reg
    128      which can be set zero or nonzero.  If it is zero, emwarns
    129      are not flagged, and instead control just flows all the
    130      way through bbs as usual.
    131 */
    132 
    133 /* "Special" instructions.
    134 
    135    This instruction decoder can decode four special instructions
    136    which mean nothing natively (are no-ops as far as regs/mem are
    137    concerned) but have meaning for supporting Valgrind.  A special
    138    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    139    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    140    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    141    one of the following 4 is allowed (standard interpretation in
    142    parentheses):
    143 
    144       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    145       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    146       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    147       87FF (xchgl %edi,%edi)   IR injection
    148 
    149    Any other bytes following the 12-byte preamble are illegal and
    150    constitute a failure in instruction decoding.  This all assumes
    151    that the preamble will never occur except in specific code
    152    fragments designed for Valgrind to catch.
    153 
    154    No prefixes may precede a "Special" instruction.
    155 */
    156 
    157 /* LOCK prefixed instructions.  These are translated using IR-level
    158    CAS statements (IRCAS) and are believed to preserve atomicity, even
    159    from the point of view of some other process racing against a
    160    simulated one (presumably they communicate via a shared memory
    161    segment).
    162 
    163    Handlers which are aware of LOCK prefixes are:
    164       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    165       dis_cmpxchg_G_E  (cmpxchg)
    166       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    167       dis_Grp3         (not, neg)
    168       dis_Grp4         (inc, dec)
    169       dis_Grp5         (inc, dec)
    170       dis_Grp8_Imm     (bts, btc, btr)
    171       dis_bt_G_E       (bts, btc, btr)
    172       dis_xadd_G_E     (xadd)
    173 */
    174 
    175 
    176 #include "libvex_basictypes.h"
    177 #include "libvex_ir.h"
    178 #include "libvex.h"
    179 #include "libvex_guest_x86.h"
    180 
    181 #include "main_util.h"
    182 #include "main_globals.h"
    183 #include "guest_generic_bb_to_IR.h"
    184 #include "guest_generic_x87.h"
    185 #include "guest_x86_defs.h"
    186 
    187 
    188 /*------------------------------------------------------------*/
    189 /*--- Globals                                              ---*/
    190 /*------------------------------------------------------------*/
    191 
    192 /* These are set at the start of the translation of an insn, right
    193    down in disInstr_X86, so that we don't have to pass them around
    194    endlessly.  They are all constant during the translation of any
    195    given insn. */
    196 
    197 /* We need to know this to do sub-register accesses correctly. */
    198 static VexEndness host_endness;
    199 
    200 /* Pointer to the guest code area (points to start of BB, not to the
    201    insn being processed). */
    202 static const UChar* guest_code;
    203 
    204 /* The guest address corresponding to guest_code[0]. */
    205 static Addr32 guest_EIP_bbstart;
    206 
    207 /* The guest address for the instruction currently being
    208    translated. */
    209 static Addr32 guest_EIP_curr_instr;
    210 
    211 /* The IRSB* into which we're generating code. */
    212 static IRSB* irsb;
    213 
    214 
    215 /*------------------------------------------------------------*/
    216 /*--- Debugging output                                     ---*/
    217 /*------------------------------------------------------------*/
    218 
    219 #define DIP(format, args...)           \
    220    if (vex_traceflags & VEX_TRACE_FE)  \
    221       vex_printf(format, ## args)
    222 
    223 #define DIS(buf, format, args...)      \
    224    if (vex_traceflags & VEX_TRACE_FE)  \
    225       vex_sprintf(buf, format, ## args)
    226 
    227 
    228 /*------------------------------------------------------------*/
    229 /*--- Offsets of various parts of the x86 guest state.     ---*/
    230 /*------------------------------------------------------------*/
    231 
    232 #define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
    233 #define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
    234 #define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
    235 #define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
    236 #define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
    237 #define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
    238 #define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
    239 #define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
    240 
    241 #define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
    242 
    243 #define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
    244 #define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
    245 #define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
    246 #define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
    247 
    248 #define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
    249 #define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
    250 #define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
    251 #define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
    252 #define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
    253 #define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
    254 #define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
    255 #define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
    256 
    257 #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
    258 #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
    259 #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
    260 #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
    261 #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
    262 #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
    263 #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
    264 #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
    265 
    266 #define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
    267 #define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
    268 #define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
    269 #define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
    270 #define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
    271 #define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
    272 #define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
    273 #define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
    274 #define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
    275 
    276 #define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)
    277 
    278 #define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
    279 #define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
    280 #define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
    281 
    282 #define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    283 
    284 
    285 /*------------------------------------------------------------*/
    286 /*--- Helper bits and pieces for deconstructing the        ---*/
    287 /*--- x86 insn stream.                                     ---*/
    288 /*------------------------------------------------------------*/
    289 
    290 /* This is the Intel register encoding -- integer regs. */
    291 #define R_EAX 0
    292 #define R_ECX 1
    293 #define R_EDX 2
    294 #define R_EBX 3
    295 #define R_ESP 4
    296 #define R_EBP 5
    297 #define R_ESI 6
    298 #define R_EDI 7
    299 
    300 #define R_AL (0+R_EAX)
    301 #define R_AH (4+R_EAX)
    302 
    303 /* This is the Intel register encoding -- segment regs. */
    304 #define R_ES 0
    305 #define R_CS 1
    306 #define R_SS 2
    307 #define R_DS 3
    308 #define R_FS 4
    309 #define R_GS 5
    310 
    311 
    312 /* Add a statement to the list held by "irbb". */
    313 static void stmt ( IRStmt* st )
    314 {
    315    addStmtToIRSB( irsb, st );
    316 }
    317 
    318 /* Generate a new temporary of the given type. */
    319 static IRTemp newTemp ( IRType ty )
    320 {
    321    vassert(isPlausibleIRType(ty));
    322    return newIRTemp( irsb->tyenv, ty );
    323 }
    324 
    325 /* Various simple conversions */
    326 
    327 static UInt extend_s_8to32( UInt x )
    328 {
    329    return (UInt)((Int)(x << 24) >> 24);
    330 }
    331 
    332 static UInt extend_s_16to32 ( UInt x )
    333 {
    334   return (UInt)((Int)(x << 16) >> 16);
    335 }
    336 
    337 /* Fetch a byte from the guest insn stream. */
    338 static UChar getIByte ( Int delta )
    339 {
    340    return guest_code[delta];
    341 }
    342 
    343 /* Extract the reg field from a modRM byte. */
    344 static Int gregOfRM ( UChar mod_reg_rm )
    345 {
    346    return (Int)( (mod_reg_rm >> 3) & 7 );
    347 }
    348 
    349 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    350    register or memory.  If so, the byte will have the form 11XXXYYY,
    351    where YYY is the register number. */
    352 static Bool epartIsReg ( UChar mod_reg_rm )
    353 {
    354    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    355 }
    356 
    357 /* ... and extract the register number ... */
    358 static Int eregOfRM ( UChar mod_reg_rm )
    359 {
    360    return (Int)(mod_reg_rm & 0x7);
    361 }
    362 
    363 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    364 
    365 static UChar getUChar ( Int delta )
    366 {
    367    UChar v = guest_code[delta+0];
    368    return toUChar(v);
    369 }
    370 
    371 static UInt getUDisp16 ( Int delta )
    372 {
    373    UInt v = guest_code[delta+1]; v <<= 8;
    374    v |= guest_code[delta+0];
    375    return v & 0xFFFF;
    376 }
    377 
    378 static UInt getUDisp32 ( Int delta )
    379 {
    380    UInt v = guest_code[delta+3]; v <<= 8;
    381    v |= guest_code[delta+2]; v <<= 8;
    382    v |= guest_code[delta+1]; v <<= 8;
    383    v |= guest_code[delta+0];
    384    return v;
    385 }
    386 
    387 static UInt getUDisp ( Int size, Int delta )
    388 {
    389    switch (size) {
    390       case 4: return getUDisp32(delta);
    391       case 2: return getUDisp16(delta);
    392       case 1: return (UInt)getUChar(delta);
    393       default: vpanic("getUDisp(x86)");
    394    }
    395    return 0; /*notreached*/
    396 }
    397 
    398 
    399 /* Get a byte value out of the insn stream and sign-extend to 32
    400    bits. */
    401 static UInt getSDisp8 ( Int delta )
    402 {
    403    return extend_s_8to32( (UInt) (guest_code[delta]) );
    404 }
    405 
    406 static UInt getSDisp16 ( Int delta0 )
    407 {
    408    const UChar* eip = &guest_code[delta0];
    409    UInt d = *eip++;
    410    d |= ((*eip++) << 8);
    411    return extend_s_16to32(d);
    412 }
    413 
    414 static UInt getSDisp ( Int size, Int delta )
    415 {
    416    switch (size) {
    417       case 4: return getUDisp32(delta);
    418       case 2: return getSDisp16(delta);
    419       case 1: return getSDisp8(delta);
    420       default: vpanic("getSDisp(x86)");
    421   }
    422   return 0; /*notreached*/
    423 }
    424 
    425 
    426 /*------------------------------------------------------------*/
    427 /*--- Helpers for constructing IR.                         ---*/
    428 /*------------------------------------------------------------*/
    429 
    430 /* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
    431    register references, we need to take the host endianness into
    432    account.  Supplied value is 0 .. 7 and in the Intel instruction
    433    encoding. */
    434 
    435 static IRType szToITy ( Int n )
    436 {
    437    switch (n) {
    438       case 1: return Ity_I8;
    439       case 2: return Ity_I16;
    440       case 4: return Ity_I32;
    441       default: vpanic("szToITy(x86)");
    442    }
    443 }
    444 
    445 /* On a little-endian host, less significant bits of the guest
    446    registers are at lower addresses.  Therefore, if a reference to a
    447    register low half has the safe guest state offset as a reference to
    448    the full register.
    449 */
    450 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    451 {
    452    vassert(archreg < 8);
    453 
    454    /* Correct for little-endian host only. */
    455    vassert(host_endness == VexEndnessLE);
    456 
    457    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    458       switch (archreg) {
    459          case R_EAX: return OFFB_EAX;
    460          case R_EBX: return OFFB_EBX;
    461          case R_ECX: return OFFB_ECX;
    462          case R_EDX: return OFFB_EDX;
    463          case R_ESI: return OFFB_ESI;
    464          case R_EDI: return OFFB_EDI;
    465          case R_ESP: return OFFB_ESP;
    466          case R_EBP: return OFFB_EBP;
    467          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    468       }
    469    }
    470 
    471    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    472    switch (archreg-4) {
    473       case R_EAX: return 1+ OFFB_EAX;
    474       case R_EBX: return 1+ OFFB_EBX;
    475       case R_ECX: return 1+ OFFB_ECX;
    476       case R_EDX: return 1+ OFFB_EDX;
    477       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    478    }
    479 
    480    /* NOTREACHED */
    481    vpanic("integerGuestRegOffset(x86,le)");
    482 }
    483 
    484 static Int segmentGuestRegOffset ( UInt sreg )
    485 {
    486    switch (sreg) {
    487       case R_ES: return OFFB_ES;
    488       case R_CS: return OFFB_CS;
    489       case R_SS: return OFFB_SS;
    490       case R_DS: return OFFB_DS;
    491       case R_FS: return OFFB_FS;
    492       case R_GS: return OFFB_GS;
    493       default: vpanic("segmentGuestRegOffset(x86)");
    494    }
    495 }
    496 
    497 static Int xmmGuestRegOffset ( UInt xmmreg )
    498 {
    499    switch (xmmreg) {
    500       case 0: return OFFB_XMM0;
    501       case 1: return OFFB_XMM1;
    502       case 2: return OFFB_XMM2;
    503       case 3: return OFFB_XMM3;
    504       case 4: return OFFB_XMM4;
    505       case 5: return OFFB_XMM5;
    506       case 6: return OFFB_XMM6;
    507       case 7: return OFFB_XMM7;
    508       default: vpanic("xmmGuestRegOffset");
    509    }
    510 }
    511 
    512 /* Lanes of vector registers are always numbered from zero being the
    513    least significant lane (rightmost in the register).  */
    514 
    515 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    516 {
    517    /* Correct for little-endian host only. */
    518    vassert(host_endness == VexEndnessLE);
    519    vassert(laneno >= 0 && laneno < 8);
    520    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    521 }
    522 
    523 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    524 {
    525    /* Correct for little-endian host only. */
    526    vassert(host_endness == VexEndnessLE);
    527    vassert(laneno >= 0 && laneno < 4);
    528    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    529 }
    530 
    531 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    532 {
    533    /* Correct for little-endian host only. */
    534    vassert(host_endness == VexEndnessLE);
    535    vassert(laneno >= 0 && laneno < 2);
    536    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    537 }
    538 
    539 static IRExpr* getIReg ( Int sz, UInt archreg )
    540 {
    541    vassert(sz == 1 || sz == 2 || sz == 4);
    542    vassert(archreg < 8);
    543    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    544                       szToITy(sz) );
    545 }
    546 
    547 /* Ditto, but write to a reg instead. */
    548 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    549 {
    550    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    551    switch (sz) {
    552       case 1: vassert(ty == Ity_I8); break;
    553       case 2: vassert(ty == Ity_I16); break;
    554       case 4: vassert(ty == Ity_I32); break;
    555       default: vpanic("putIReg(x86)");
    556    }
    557    vassert(archreg < 8);
    558    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    559 }
    560 
    561 static IRExpr* getSReg ( UInt sreg )
    562 {
    563    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    564 }
    565 
    566 static void putSReg ( UInt sreg, IRExpr* e )
    567 {
    568    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    569    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    570 }
    571 
    572 static IRExpr* getXMMReg ( UInt xmmreg )
    573 {
    574    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    575 }
    576 
    577 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    578 {
    579    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    580 }
    581 
    582 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    583 {
    584    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    585 }
    586 
    587 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    588 {
    589    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    590 }
    591 
    592 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    593 {
    594    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    595 }
    596 
    597 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    598 {
    599    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    600    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    601 }
    602 
    603 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    604 {
    605    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    606    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    607 }
    608 
    609 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    610 {
    611    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    612    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    613 }
    614 
    615 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    616 {
    617    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    618    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    619 }
    620 
    621 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    622 {
    623    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    624    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    625 }
    626 
    627 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    628 {
    629    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    630    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    631 }
    632 
    633 static void assign ( IRTemp dst, IRExpr* e )
    634 {
    635    stmt( IRStmt_WrTmp(dst, e) );
    636 }
    637 
    638 static void storeLE ( IRExpr* addr, IRExpr* data )
    639 {
    640    stmt( IRStmt_Store(Iend_LE, addr, data) );
    641 }
    642 
    643 static IRExpr* unop ( IROp op, IRExpr* a )
    644 {
    645    return IRExpr_Unop(op, a);
    646 }
    647 
    648 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    649 {
    650    return IRExpr_Binop(op, a1, a2);
    651 }
    652 
    653 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    654 {
    655    return IRExpr_Triop(op, a1, a2, a3);
    656 }
    657 
    658 static IRExpr* mkexpr ( IRTemp tmp )
    659 {
    660    return IRExpr_RdTmp(tmp);
    661 }
    662 
    663 static IRExpr* mkU8 ( UInt i )
    664 {
    665    vassert(i < 256);
    666    return IRExpr_Const(IRConst_U8( (UChar)i ));
    667 }
    668 
    669 static IRExpr* mkU16 ( UInt i )
    670 {
    671    vassert(i < 65536);
    672    return IRExpr_Const(IRConst_U16( (UShort)i ));
    673 }
    674 
    675 static IRExpr* mkU32 ( UInt i )
    676 {
    677    return IRExpr_Const(IRConst_U32(i));
    678 }
    679 
    680 static IRExpr* mkU64 ( ULong i )
    681 {
    682    return IRExpr_Const(IRConst_U64(i));
    683 }
    684 
    685 static IRExpr* mkU ( IRType ty, UInt i )
    686 {
    687    if (ty == Ity_I8)  return mkU8(i);
    688    if (ty == Ity_I16) return mkU16(i);
    689    if (ty == Ity_I32) return mkU32(i);
    690    /* If this panics, it usually means you passed a size (1,2,4)
    691       value as the IRType, rather than a real IRType. */
    692    vpanic("mkU(x86)");
    693 }
    694 
    695 static IRExpr* mkV128 ( UShort mask )
    696 {
    697    return IRExpr_Const(IRConst_V128(mask));
    698 }
    699 
    700 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    701 {
    702    return IRExpr_Load(Iend_LE, ty, addr);
    703 }
    704 
    705 static IROp mkSizedOp ( IRType ty, IROp op8 )
    706 {
    707    Int adj;
    708    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    709    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    710            || op8 == Iop_Mul8
    711            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    712            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    713            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    714            || op8 == Iop_CasCmpNE8
    715            || op8 == Iop_ExpCmpNE8
    716            || op8 == Iop_Not8);
    717    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    718    return adj + op8;
    719 }
    720 
    721 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    722 {
    723    if (szSmall == 1 && szBig == 4) {
    724       return signd ? Iop_8Sto32 : Iop_8Uto32;
    725    }
    726    if (szSmall == 1 && szBig == 2) {
    727       return signd ? Iop_8Sto16 : Iop_8Uto16;
    728    }
    729    if (szSmall == 2 && szBig == 4) {
    730       return signd ? Iop_16Sto32 : Iop_16Uto32;
    731    }
    732    vpanic("mkWidenOp(x86,guest)");
    733 }
    734 
    735 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    736 {
    737    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    738    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    739    return unop(Iop_32to1,
    740                binop(Iop_And32,
    741                      unop(Iop_1Uto32,x),
    742                      unop(Iop_1Uto32,y)));
    743 }
    744 
    745 /* Generate a compare-and-swap operation, operating on memory at
    746    'addr'.  The expected value is 'expVal' and the new value is
    747    'newVal'.  If the operation fails, then transfer control (with a
    748    no-redir jump (XXX no -- see comment at top of this file)) to
    749    'restart_point', which is presumably the address of the guest
    750    instruction again -- retrying, essentially. */
    751 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    752                     Addr32 restart_point )
    753 {
    754    IRCAS* cas;
    755    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    756    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    757    IRTemp oldTmp = newTemp(tyE);
    758    IRTemp expTmp = newTemp(tyE);
    759    vassert(tyE == tyN);
    760    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    761    assign(expTmp, expVal);
    762    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    763                   NULL, mkexpr(expTmp), NULL, newVal );
    764    stmt( IRStmt_CAS(cas) );
    765    stmt( IRStmt_Exit(
    766             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    767                    mkexpr(oldTmp), mkexpr(expTmp) ),
    768             Ijk_Boring, /*Ijk_NoRedir*/
    769             IRConst_U32( restart_point ),
    770             OFFB_EIP
    771          ));
    772 }
    773 
    774 
    775 /*------------------------------------------------------------*/
    776 /*--- Helpers for %eflags.                                 ---*/
    777 /*------------------------------------------------------------*/
    778 
    779 /* -------------- Evaluating the flags-thunk. -------------- */
    780 
    781 /* Build IR to calculate all the eflags from stored
    782    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    783    Ity_I32. */
    784 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    785 {
    786    IRExpr** args
    787       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    788                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    789                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    790                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    791    IRExpr* call
    792       = mkIRExprCCall(
    793            Ity_I32,
    794            0/*regparm*/,
    795            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    796            args
    797         );
    798    /* Exclude OP and NDEP from definedness checking.  We're only
    799       interested in DEP1 and DEP2. */
    800    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    801    return call;
    802 }
    803 
    804 /* Build IR to calculate some particular condition from stored
    805    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    806    Ity_Bit. */
    807 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    808 {
    809    IRExpr** args
    810       = mkIRExprVec_5( mkU32(cond),
    811                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    812                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    813                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    814                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    815    IRExpr* call
    816       = mkIRExprCCall(
    817            Ity_I32,
    818            0/*regparm*/,
    819            "x86g_calculate_condition", &x86g_calculate_condition,
    820            args
    821         );
    822    /* Exclude the requested condition, OP and NDEP from definedness
    823       checking.  We're only interested in DEP1 and DEP2. */
    824    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    825    return unop(Iop_32to1, call);
    826 }
    827 
    828 /* Build IR to calculate just the carry flag from stored
    829    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    830 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    831 {
    832    IRExpr** args
    833       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    834                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    835                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    836                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    837    IRExpr* call
    838       = mkIRExprCCall(
    839            Ity_I32,
    840            3/*regparm*/,
    841            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    842            args
    843         );
    844    /* Exclude OP and NDEP from definedness checking.  We're only
    845       interested in DEP1 and DEP2. */
    846    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    847    return call;
    848 }
    849 
    850 
    851 /* -------------- Building the flags-thunk. -------------- */
    852 
    853 /* The machinery in this section builds the flag-thunk following a
    854    flag-setting operation.  Hence the various setFlags_* functions.
    855 */
    856 
    857 static Bool isAddSub ( IROp op8 )
    858 {
    859    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    860 }
    861 
    862 static Bool isLogic ( IROp op8 )
    863 {
    864    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    865 }
    866 
    867 /* U-widen 8/16/32 bit int expr to 32. */
    868 static IRExpr* widenUto32 ( IRExpr* e )
    869 {
    870    switch (typeOfIRExpr(irsb->tyenv,e)) {
    871       case Ity_I32: return e;
    872       case Ity_I16: return unop(Iop_16Uto32,e);
    873       case Ity_I8:  return unop(Iop_8Uto32,e);
    874       default: vpanic("widenUto32");
    875    }
    876 }
    877 
    878 /* S-widen 8/16/32 bit int expr to 32. */
    879 static IRExpr* widenSto32 ( IRExpr* e )
    880 {
    881    switch (typeOfIRExpr(irsb->tyenv,e)) {
    882       case Ity_I32: return e;
    883       case Ity_I16: return unop(Iop_16Sto32,e);
    884       case Ity_I8:  return unop(Iop_8Sto32,e);
    885       default: vpanic("widenSto32");
    886    }
    887 }
    888 
    889 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    890    of these combinations make sense. */
    891 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    892 {
    893    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    894    if (src_ty == dst_ty)
    895       return e;
    896    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    897       return unop(Iop_32to16, e);
    898    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    899       return unop(Iop_32to8, e);
    900 
    901    vex_printf("\nsrc, dst tys are: ");
    902    ppIRType(src_ty);
    903    vex_printf(", ");
    904    ppIRType(dst_ty);
    905    vex_printf("\n");
    906    vpanic("narrowTo(x86)");
    907 }
    908 
    909 
    910 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    911    auto-sized up to the real op. */
    912 
    913 static
    914 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    915 {
    916    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    917 
    918    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    919 
    920    switch (op8) {
    921       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    922       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    923       default:       ppIROp(op8);
    924                      vpanic("setFlags_DEP1_DEP2(x86)");
    925    }
    926    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    927    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    928    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    929    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    930       elimination of previous stores to this field work better. */
    931    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    932 }
    933 
    934 
    935 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    936 
    937 static
    938 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    939 {
    940    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    941 
    942    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    943 
    944    switch (op8) {
    945       case Iop_Or8:
    946       case Iop_And8:
    947       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    948       default:       ppIROp(op8);
    949                      vpanic("setFlags_DEP1(x86)");
    950    }
    951    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    952    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    953    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    954    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    955       elimination of previous stores to this field work better. */
    956    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    957 }
    958 
    959 
    960 /* For shift operations, we put in the result and the undershifted
    961    result.  Except if the shift amount is zero, the thunk is left
    962    unchanged. */
    963 
    964 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    965                                        IRTemp  res,
    966                                        IRTemp  resUS,
    967                                        IRType  ty,
    968                                        IRTemp  guard )
    969 {
    970    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    971 
    972    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    973    vassert(guard);
    974 
    975    /* Both kinds of right shifts are handled by the same thunk
    976       operation. */
    977    switch (op32) {
    978       case Iop_Shr32:
    979       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    980       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    981       default:        ppIROp(op32);
    982                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    983    }
    984 
    985    /* guard :: Ity_I8.  We need to convert it to I1. */
    986    IRTemp guardB = newTemp(Ity_I1);
    987    assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
    988 
    989    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    990    stmt( IRStmt_Put( OFFB_CC_OP,
    991                      IRExpr_ITE( mkexpr(guardB),
    992                                  mkU32(ccOp),
    993                                  IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
    994    stmt( IRStmt_Put( OFFB_CC_DEP1,
    995                      IRExpr_ITE( mkexpr(guardB),
    996                                  widenUto32(mkexpr(res)),
    997                                  IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
    998    stmt( IRStmt_Put( OFFB_CC_DEP2,
    999                      IRExpr_ITE( mkexpr(guardB),
   1000                                  widenUto32(mkexpr(resUS)),
   1001                                  IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   1002    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1003       elimination of previous stores to this field work better. */
   1004    stmt( IRStmt_Put( OFFB_CC_NDEP,
   1005                      IRExpr_ITE( mkexpr(guardB),
   1006                                  mkU32(0),
   1007                                  IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   1008 }
   1009 
   1010 
   1011 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1012    the former value of the carry flag, which unfortunately we have to
   1013    compute. */
   1014 
   1015 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1016 {
   1017    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1018 
   1019    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1020    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1021 
   1022    /* This has to come first, because calculating the C flag
   1023       may require reading all four thunk fields. */
   1024    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1025    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1026    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1027    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1028 }
   1029 
   1030 
   1031 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1032    two arguments. */
   1033 
   1034 static
   1035 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1036 {
   1037    switch (ty) {
   1038       case Ity_I8:
   1039          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1040          break;
   1041       case Ity_I16:
   1042          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1043          break;
   1044       case Ity_I32:
   1045          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1046          break;
   1047       default:
   1048          vpanic("setFlags_MUL(x86)");
   1049    }
   1050    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1051    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1052    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1053       elimination of previous stores to this field work better. */
   1054    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1055 }
   1056 
   1057 
   1058 /* -------------- Condition codes. -------------- */
   1059 
   1060 /* Condition codes, using the Intel encoding.  */
   1061 
   1062 static const HChar* name_X86Condcode ( X86Condcode cond )
   1063 {
   1064    switch (cond) {
   1065       case X86CondO:      return "o";
   1066       case X86CondNO:     return "no";
   1067       case X86CondB:      return "b";
   1068       case X86CondNB:     return "nb";
   1069       case X86CondZ:      return "z";
   1070       case X86CondNZ:     return "nz";
   1071       case X86CondBE:     return "be";
   1072       case X86CondNBE:    return "nbe";
   1073       case X86CondS:      return "s";
   1074       case X86CondNS:     return "ns";
   1075       case X86CondP:      return "p";
   1076       case X86CondNP:     return "np";
   1077       case X86CondL:      return "l";
   1078       case X86CondNL:     return "nl";
   1079       case X86CondLE:     return "le";
   1080       case X86CondNLE:    return "nle";
   1081       case X86CondAlways: return "ALWAYS";
   1082       default: vpanic("name_X86Condcode");
   1083    }
   1084 }
   1085 
   1086 static
   1087 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1088                                       Bool*        needInvert )
   1089 {
   1090    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1091    if (cond & 1) {
   1092       *needInvert = True;
   1093       return cond-1;
   1094    } else {
   1095       *needInvert = False;
   1096       return cond;
   1097    }
   1098 }
   1099 
   1100 
   1101 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1102 
   1103 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1104    appropriately.
   1105 
   1106    Optionally, generate a store for the 'tres' value.  This can either
   1107    be a normal store, or it can be a cas-with-possible-failure style
   1108    store:
   1109 
   1110    if taddr is IRTemp_INVALID, then no store is generated.
   1111 
   1112    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1113    the address) is generated:
   1114 
   1115      if texpVal is IRTemp_INVALID then a normal store is
   1116      generated, and restart_point must be zero (it is irrelevant).
   1117 
   1118      if texpVal is not IRTemp_INVALID then a cas-style store is
   1119      generated.  texpVal is the expected value, restart_point
   1120      is the restart point if the store fails, and texpVal must
   1121      have the same type as tres.
   1122 */
   1123 static void helper_ADC ( Int sz,
   1124                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1125                          /* info about optional store: */
   1126                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1127 {
   1128    UInt    thunkOp;
   1129    IRType  ty    = szToITy(sz);
   1130    IRTemp  oldc  = newTemp(Ity_I32);
   1131    IRTemp  oldcn = newTemp(ty);
   1132    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1133    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1134 
   1135    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1136    vassert(sz == 1 || sz == 2 || sz == 4);
   1137    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1138                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1139 
   1140    /* oldc = old carry flag, 0 or 1 */
   1141    assign( oldc,  binop(Iop_And32,
   1142                         mk_x86g_calculate_eflags_c(),
   1143                         mkU32(1)) );
   1144 
   1145    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1146 
   1147    assign( tres, binop(plus,
   1148                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1149                        mkexpr(oldcn)) );
   1150 
   1151    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1152       start of this function. */
   1153    if (taddr != IRTemp_INVALID) {
   1154       if (texpVal == IRTemp_INVALID) {
   1155          vassert(restart_point == 0);
   1156          storeLE( mkexpr(taddr), mkexpr(tres) );
   1157       } else {
   1158          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1159          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1160          casLE( mkexpr(taddr),
   1161                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1162       }
   1163    }
   1164 
   1165    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1166    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1167    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1168                                                          mkexpr(oldcn)) )) );
   1169    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1170 }
   1171 
   1172 
   1173 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1174    appropriately.  As with helper_ADC, possibly generate a store of
   1175    the result -- see comments on helper_ADC for details.
   1176 */
   1177 static void helper_SBB ( Int sz,
   1178                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1179                          /* info about optional store: */
   1180                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1181 {
   1182    UInt    thunkOp;
   1183    IRType  ty    = szToITy(sz);
   1184    IRTemp  oldc  = newTemp(Ity_I32);
   1185    IRTemp  oldcn = newTemp(ty);
   1186    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1187    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1188 
   1189    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1190    vassert(sz == 1 || sz == 2 || sz == 4);
   1191    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1192                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1193 
   1194    /* oldc = old carry flag, 0 or 1 */
   1195    assign( oldc, binop(Iop_And32,
   1196                        mk_x86g_calculate_eflags_c(),
   1197                        mkU32(1)) );
   1198 
   1199    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1200 
   1201    assign( tres, binop(minus,
   1202                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1203                        mkexpr(oldcn)) );
   1204 
   1205    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1206       start of this function. */
   1207    if (taddr != IRTemp_INVALID) {
   1208       if (texpVal == IRTemp_INVALID) {
   1209          vassert(restart_point == 0);
   1210          storeLE( mkexpr(taddr), mkexpr(tres) );
   1211       } else {
   1212          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1213          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1214          casLE( mkexpr(taddr),
   1215                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1216       }
   1217    }
   1218 
   1219    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1220    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1221    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1222                                                          mkexpr(oldcn)) )) );
   1223    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1224 }
   1225 
   1226 
   1227 /* -------------- Helpers for disassembly printing. -------------- */
   1228 
   1229 static const HChar* nameGrp1 ( Int opc_aux )
   1230 {
   1231    static const HChar* grp1_names[8]
   1232      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1233    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1234    return grp1_names[opc_aux];
   1235 }
   1236 
   1237 static const HChar* nameGrp2 ( Int opc_aux )
   1238 {
   1239    static const HChar* grp2_names[8]
   1240      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1241    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1242    return grp2_names[opc_aux];
   1243 }
   1244 
   1245 static const HChar* nameGrp4 ( Int opc_aux )
   1246 {
   1247    static const HChar* grp4_names[8]
   1248      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1249    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1250    return grp4_names[opc_aux];
   1251 }
   1252 
   1253 static const HChar* nameGrp5 ( Int opc_aux )
   1254 {
   1255    static const HChar* grp5_names[8]
   1256      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1257    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1258    return grp5_names[opc_aux];
   1259 }
   1260 
   1261 static const HChar* nameGrp8 ( Int opc_aux )
   1262 {
   1263    static const HChar* grp8_names[8]
   1264      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1265    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1266    return grp8_names[opc_aux];
   1267 }
   1268 
   1269 static const HChar* nameIReg ( Int size, Int reg )
   1270 {
   1271    static const HChar* ireg32_names[8]
   1272      = { "%eax", "%ecx", "%edx", "%ebx",
   1273          "%esp", "%ebp", "%esi", "%edi" };
   1274    static const HChar* ireg16_names[8]
   1275      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1276    static const HChar* ireg8_names[8]
   1277      = { "%al", "%cl", "%dl", "%bl",
   1278          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1279    if (reg < 0 || reg > 7) goto bad;
   1280    switch (size) {
   1281       case 4: return ireg32_names[reg];
   1282       case 2: return ireg16_names[reg];
   1283       case 1: return ireg8_names[reg];
   1284    }
   1285   bad:
   1286    vpanic("nameIReg(X86)");
   1287    return NULL; /*notreached*/
   1288 }
   1289 
   1290 static const HChar* nameSReg ( UInt sreg )
   1291 {
   1292    switch (sreg) {
   1293       case R_ES: return "%es";
   1294       case R_CS: return "%cs";
   1295       case R_SS: return "%ss";
   1296       case R_DS: return "%ds";
   1297       case R_FS: return "%fs";
   1298       case R_GS: return "%gs";
   1299       default: vpanic("nameSReg(x86)");
   1300    }
   1301 }
   1302 
   1303 static const HChar* nameMMXReg ( Int mmxreg )
   1304 {
   1305    static const HChar* mmx_names[8]
   1306      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1307    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1308    return mmx_names[mmxreg];
   1309 }
   1310 
   1311 static const HChar* nameXMMReg ( Int xmmreg )
   1312 {
   1313    static const HChar* xmm_names[8]
   1314      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1315          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1316    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1317    return xmm_names[xmmreg];
   1318 }
   1319 
   1320 static const HChar* nameMMXGran ( Int gran )
   1321 {
   1322    switch (gran) {
   1323       case 0: return "b";
   1324       case 1: return "w";
   1325       case 2: return "d";
   1326       case 3: return "q";
   1327       default: vpanic("nameMMXGran(x86,guest)");
   1328    }
   1329 }
   1330 
   1331 static HChar nameISize ( Int size )
   1332 {
   1333    switch (size) {
   1334       case 4: return 'l';
   1335       case 2: return 'w';
   1336       case 1: return 'b';
   1337       default: vpanic("nameISize(x86)");
   1338    }
   1339 }
   1340 
   1341 
   1342 /*------------------------------------------------------------*/
   1343 /*--- JMP helpers                                          ---*/
   1344 /*------------------------------------------------------------*/
   1345 
   1346 static void jmp_lit( /*MOD*/DisResult* dres,
   1347                      IRJumpKind kind, Addr32 d32 )
   1348 {
   1349    vassert(dres->whatNext    == Dis_Continue);
   1350    vassert(dres->len         == 0);
   1351    vassert(dres->continueAt  == 0);
   1352    vassert(dres->jk_StopHere == Ijk_INVALID);
   1353    dres->whatNext    = Dis_StopHere;
   1354    dres->jk_StopHere = kind;
   1355    stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
   1356 }
   1357 
   1358 static void jmp_treg( /*MOD*/DisResult* dres,
   1359                       IRJumpKind kind, IRTemp t )
   1360 {
   1361    vassert(dres->whatNext    == Dis_Continue);
   1362    vassert(dres->len         == 0);
   1363    vassert(dres->continueAt  == 0);
   1364    vassert(dres->jk_StopHere == Ijk_INVALID);
   1365    dres->whatNext    = Dis_StopHere;
   1366    dres->jk_StopHere = kind;
   1367    stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
   1368 }
   1369 
   1370 static
   1371 void jcc_01( /*MOD*/DisResult* dres,
   1372              X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1373 {
   1374    Bool        invert;
   1375    X86Condcode condPos;
   1376    vassert(dres->whatNext    == Dis_Continue);
   1377    vassert(dres->len         == 0);
   1378    vassert(dres->continueAt  == 0);
   1379    vassert(dres->jk_StopHere == Ijk_INVALID);
   1380    dres->whatNext    = Dis_StopHere;
   1381    dres->jk_StopHere = Ijk_Boring;
   1382    condPos = positiveIse_X86Condcode ( cond, &invert );
   1383    if (invert) {
   1384       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1385                          Ijk_Boring,
   1386                          IRConst_U32(d32_false),
   1387                          OFFB_EIP ) );
   1388       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
   1389    } else {
   1390       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1391                          Ijk_Boring,
   1392                          IRConst_U32(d32_true),
   1393                          OFFB_EIP ) );
   1394       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
   1395    }
   1396 }
   1397 
   1398 
   1399 /*------------------------------------------------------------*/
   1400 /*--- Disassembling addressing modes                       ---*/
   1401 /*------------------------------------------------------------*/
   1402 
   1403 static
   1404 const HChar* sorbTxt ( UChar sorb )
   1405 {
   1406    switch (sorb) {
   1407       case 0:    return ""; /* no override */
   1408       case 0x3E: return "%ds";
   1409       case 0x26: return "%es:";
   1410       case 0x64: return "%fs:";
   1411       case 0x65: return "%gs:";
   1412       case 0x36: return "%ss:";
   1413       default: vpanic("sorbTxt(x86,guest)");
   1414    }
   1415 }
   1416 
   1417 
   1418 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1419    linear address by adding any required segment override as indicated
   1420    by sorb. */
   1421 static
   1422 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1423 {
   1424    Int    sreg;
   1425    IRType hWordTy;
   1426    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1427 
   1428    if (sorb == 0)
   1429       /* the common case - no override */
   1430       return virtual;
   1431 
   1432    switch (sorb) {
   1433       case 0x3E: sreg = R_DS; break;
   1434       case 0x26: sreg = R_ES; break;
   1435       case 0x64: sreg = R_FS; break;
   1436       case 0x65: sreg = R_GS; break;
   1437       case 0x36: sreg = R_SS; break;
   1438       default: vpanic("handleSegOverride(x86,guest)");
   1439    }
   1440 
   1441    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1442 
   1443    seg_selector = newTemp(Ity_I32);
   1444    ldt_ptr      = newTemp(hWordTy);
   1445    gdt_ptr      = newTemp(hWordTy);
   1446    r64          = newTemp(Ity_I64);
   1447 
   1448    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1449    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1450    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1451 
   1452    /*
   1453    Call this to do the translation and limit checks:
   1454    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1455                                  UInt seg_selector, UInt virtual_addr )
   1456    */
   1457    assign(
   1458       r64,
   1459       mkIRExprCCall(
   1460          Ity_I64,
   1461          0/*regparms*/,
   1462          "x86g_use_seg_selector",
   1463          &x86g_use_seg_selector,
   1464          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1465                         mkexpr(seg_selector), virtual)
   1466       )
   1467    );
   1468 
   1469    /* If the high 32 of the result are non-zero, there was a
   1470       failure in address translation.  In which case, make a
   1471       quick exit.
   1472    */
   1473    stmt(
   1474       IRStmt_Exit(
   1475          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1476          Ijk_MapFail,
   1477          IRConst_U32( guest_EIP_curr_instr ),
   1478          OFFB_EIP
   1479       )
   1480    );
   1481 
   1482    /* otherwise, here's the translated result. */
   1483    return unop(Iop_64to32, mkexpr(r64));
   1484 }
   1485 
   1486 
   1487 /* Generate IR to calculate an address indicated by a ModRM and
   1488    following SIB bytes.  The expression, and the number of bytes in
   1489    the address mode, are returned.  Note that this fn should not be
   1490    called if the R/M part of the address denotes a register instead of
   1491    memory.  If print_codegen is true, text of the addressing mode is
   1492    placed in buf.
   1493 
   1494    The computed address is stored in a new tempreg, and the
   1495    identity of the tempreg is returned.  */
   1496 
   1497 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1498 {
   1499    IRTemp tmp = newTemp(Ity_I32);
   1500    assign( tmp, addr32 );
   1501    return tmp;
   1502 }
   1503 
   1504 static
   1505 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1506 {
   1507    UChar mod_reg_rm = getIByte(delta);
   1508    delta++;
   1509 
   1510    buf[0] = (UChar)0;
   1511 
   1512    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1513       jump table seems a bit excessive.
   1514    */
   1515    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1516    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1517                                             /* is now XX0XXYYY */
   1518    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1519    switch (mod_reg_rm) {
   1520 
   1521       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1522          --> GET %reg, t
   1523       */
   1524       case 0x00: case 0x01: case 0x02: case 0x03:
   1525       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1526          { UChar rm = mod_reg_rm;
   1527            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1528            *len = 1;
   1529            return disAMode_copy2tmp(
   1530                   handleSegOverride(sorb, getIReg(4,rm)));
   1531          }
   1532 
   1533       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1534          --> GET %reg, t ; ADDL d8, t
   1535       */
   1536       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1537       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1538          { UChar rm = toUChar(mod_reg_rm & 7);
   1539            UInt  d  = getSDisp8(delta);
   1540            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1541            *len = 2;
   1542            return disAMode_copy2tmp(
   1543                   handleSegOverride(sorb,
   1544                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1545          }
   1546 
   1547       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1548          --> GET %reg, t ; ADDL d8, t
   1549       */
   1550       case 0x10: case 0x11: case 0x12: case 0x13:
   1551       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1552          { UChar rm = toUChar(mod_reg_rm & 7);
   1553            UInt  d  = getUDisp32(delta);
   1554            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), d, nameIReg(4,rm));
   1555            *len = 5;
   1556            return disAMode_copy2tmp(
   1557                   handleSegOverride(sorb,
   1558                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1559          }
   1560 
   1561       /* a register, %eax .. %edi.  This shouldn't happen. */
   1562       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1563       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1564          vpanic("disAMode(x86): not an addr!");
   1565 
   1566       /* a 32-bit literal address
   1567          --> MOV d32, tmp
   1568       */
   1569       case 0x05:
   1570          { UInt d = getUDisp32(delta);
   1571            *len = 5;
   1572            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1573            return disAMode_copy2tmp(
   1574                      handleSegOverride(sorb, mkU32(d)));
   1575          }
   1576 
   1577       case 0x04: {
   1578          /* SIB, with no displacement.  Special cases:
   1579             -- %esp cannot act as an index value.
   1580                If index_r indicates %esp, zero is used for the index.
   1581             -- when mod is zero and base indicates EBP, base is instead
   1582                a 32-bit literal.
   1583             It's all madness, I tell you.  Extract %index, %base and
   1584             scale from the SIB byte.  The value denoted is then:
   1585                | %index == %ESP && %base == %EBP
   1586                = d32 following SIB byte
   1587                | %index == %ESP && %base != %EBP
   1588                = %base
   1589                | %index != %ESP && %base == %EBP
   1590                = d32 following SIB byte + (%index << scale)
   1591                | %index != %ESP && %base != %EBP
   1592                = %base + (%index << scale)
   1593 
   1594             What happens to the souls of CPU architects who dream up such
   1595             horrendous schemes, do you suppose?
   1596          */
   1597          UChar sib     = getIByte(delta);
   1598          UChar scale   = toUChar((sib >> 6) & 3);
   1599          UChar index_r = toUChar((sib >> 3) & 7);
   1600          UChar base_r  = toUChar(sib & 7);
   1601          delta++;
   1602 
   1603          if (index_r != R_ESP && base_r != R_EBP) {
   1604             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1605                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1606             *len = 2;
   1607             return
   1608                disAMode_copy2tmp(
   1609                handleSegOverride(sorb,
   1610                   binop(Iop_Add32,
   1611                         getIReg(4,base_r),
   1612                         binop(Iop_Shl32, getIReg(4,index_r),
   1613                               mkU8(scale)))));
   1614          }
   1615 
   1616          if (index_r != R_ESP && base_r == R_EBP) {
   1617             UInt d = getUDisp32(delta);
   1618             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1619                       nameIReg(4,index_r), 1<<scale);
   1620             *len = 6;
   1621             return
   1622                disAMode_copy2tmp(
   1623                handleSegOverride(sorb,
   1624                   binop(Iop_Add32,
   1625                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1626                         mkU32(d))));
   1627          }
   1628 
   1629          if (index_r == R_ESP && base_r != R_EBP) {
   1630             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1631             *len = 2;
   1632             return disAMode_copy2tmp(
   1633                    handleSegOverride(sorb, getIReg(4,base_r)));
   1634          }
   1635 
   1636          if (index_r == R_ESP && base_r == R_EBP) {
   1637             UInt d = getUDisp32(delta);
   1638             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1639             *len = 6;
   1640             return disAMode_copy2tmp(
   1641                    handleSegOverride(sorb, mkU32(d)));
   1642          }
   1643          /*NOTREACHED*/
   1644          vassert(0);
   1645       }
   1646 
   1647       /* SIB, with 8-bit displacement.  Special cases:
   1648          -- %esp cannot act as an index value.
   1649             If index_r indicates %esp, zero is used for the index.
   1650          Denoted value is:
   1651             | %index == %ESP
   1652             = d8 + %base
   1653             | %index != %ESP
   1654             = d8 + %base + (%index << scale)
   1655       */
   1656       case 0x0C: {
   1657          UChar sib     = getIByte(delta);
   1658          UChar scale   = toUChar((sib >> 6) & 3);
   1659          UChar index_r = toUChar((sib >> 3) & 7);
   1660          UChar base_r  = toUChar(sib & 7);
   1661          UInt  d       = getSDisp8(delta+1);
   1662 
   1663          if (index_r == R_ESP) {
   1664             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1665                                    (Int)d, nameIReg(4,base_r));
   1666             *len = 3;
   1667             return disAMode_copy2tmp(
   1668                    handleSegOverride(sorb,
   1669                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1670          } else {
   1671             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1672                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1673             *len = 3;
   1674             return
   1675                 disAMode_copy2tmp(
   1676                 handleSegOverride(sorb,
   1677                   binop(Iop_Add32,
   1678                         binop(Iop_Add32,
   1679                               getIReg(4,base_r),
   1680                               binop(Iop_Shl32,
   1681                                     getIReg(4,index_r), mkU8(scale))),
   1682                         mkU32(d))));
   1683          }
   1684 	 /*NOTREACHED*/
   1685          vassert(0);
   1686       }
   1687 
   1688       /* SIB, with 32-bit displacement.  Special cases:
   1689          -- %esp cannot act as an index value.
   1690             If index_r indicates %esp, zero is used for the index.
   1691          Denoted value is:
   1692             | %index == %ESP
   1693             = d32 + %base
   1694             | %index != %ESP
   1695             = d32 + %base + (%index << scale)
   1696       */
   1697       case 0x14: {
   1698          UChar sib     = getIByte(delta);
   1699          UChar scale   = toUChar((sib >> 6) & 3);
   1700          UChar index_r = toUChar((sib >> 3) & 7);
   1701          UChar base_r  = toUChar(sib & 7);
   1702          UInt d        = getUDisp32(delta+1);
   1703 
   1704          if (index_r == R_ESP) {
   1705             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1706                                    (Int)d, nameIReg(4,base_r));
   1707             *len = 6;
   1708             return disAMode_copy2tmp(
   1709                    handleSegOverride(sorb,
   1710                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1711          } else {
   1712             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1713                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1714             *len = 6;
   1715             return
   1716                 disAMode_copy2tmp(
   1717                 handleSegOverride(sorb,
   1718                   binop(Iop_Add32,
   1719                         binop(Iop_Add32,
   1720                               getIReg(4,base_r),
   1721                               binop(Iop_Shl32,
   1722                                     getIReg(4,index_r), mkU8(scale))),
   1723                         mkU32(d))));
   1724          }
   1725 	 /*NOTREACHED*/
   1726          vassert(0);
   1727       }
   1728 
   1729       default:
   1730          vpanic("disAMode(x86)");
   1731          return 0; /*notreached*/
   1732    }
   1733 }
   1734 
   1735 
   1736 /* Figure out the number of (insn-stream) bytes constituting the amode
   1737    beginning at delta.  Is useful for getting hold of literals beyond
   1738    the end of the amode before it has been disassembled.  */
   1739 
   1740 static UInt lengthAMode ( Int delta )
   1741 {
   1742    UChar mod_reg_rm = getIByte(delta); delta++;
   1743 
   1744    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1745       jump table seems a bit excessive.
   1746    */
   1747    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1748    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1749                                      /* is now XX0XXYYY */
   1750    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
   1751    switch (mod_reg_rm) {
   1752 
   1753       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1754       case 0x00: case 0x01: case 0x02: case 0x03:
   1755       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1756          return 1;
   1757 
   1758       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1759       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1760       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1761          return 2;
   1762 
   1763       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1764       case 0x10: case 0x11: case 0x12: case 0x13:
   1765       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1766          return 5;
   1767 
   1768       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1769       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1770       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1771          return 1;
   1772 
   1773       /* a 32-bit literal address. */
   1774       case 0x05: return 5;
   1775 
   1776       /* SIB, no displacement.  */
   1777       case 0x04: {
   1778          UChar sib    = getIByte(delta);
   1779          UChar base_r = toUChar(sib & 7);
   1780          if (base_r == R_EBP) return 6; else return 2;
   1781       }
   1782       /* SIB, with 8-bit displacement.  */
   1783       case 0x0C: return 3;
   1784 
   1785       /* SIB, with 32-bit displacement.  */
   1786       case 0x14: return 6;
   1787 
   1788       default:
   1789          vpanic("lengthAMode");
   1790          return 0; /*notreached*/
   1791    }
   1792 }
   1793 
   1794 /*------------------------------------------------------------*/
   1795 /*--- Disassembling common idioms                          ---*/
   1796 /*------------------------------------------------------------*/
   1797 
   1798 /* Handle binary integer instructions of the form
   1799       op E, G  meaning
   1800       op reg-or-mem, reg
   1801    Is passed the a ptr to the modRM byte, the actual operation, and the
   1802    data size.  Returns the address advanced completely over this
   1803    instruction.
   1804 
   1805    E(src) is reg-or-mem
   1806    G(dst) is reg.
   1807 
   1808    If E is reg, -->    GET %G,  tmp
   1809                        OP %E,   tmp
   1810                        PUT tmp, %G
   1811 
   1812    If E is mem and OP is not reversible,
   1813                 -->    (getAddr E) -> tmpa
   1814                        LD (tmpa), tmpa
   1815                        GET %G, tmp2
   1816                        OP tmpa, tmp2
   1817                        PUT tmp2, %G
   1818 
   1819    If E is mem and OP is reversible
   1820                 -->    (getAddr E) -> tmpa
   1821                        LD (tmpa), tmpa
   1822                        OP %G, tmpa
   1823                        PUT tmpa, %G
   1824 */
   1825 static
   1826 UInt dis_op2_E_G ( UChar       sorb,
   1827                    Bool        addSubCarry,
   1828                    IROp        op8,
   1829                    Bool        keep,
   1830                    Int         size,
   1831                    Int         delta0,
   1832                    const HChar* t_x86opc )
   1833 {
   1834    HChar   dis_buf[50];
   1835    Int     len;
   1836    IRType  ty   = szToITy(size);
   1837    IRTemp  dst1 = newTemp(ty);
   1838    IRTemp  src  = newTemp(ty);
   1839    IRTemp  dst0 = newTemp(ty);
   1840    UChar   rm   = getUChar(delta0);
   1841    IRTemp  addr = IRTemp_INVALID;
   1842 
   1843    /* addSubCarry == True indicates the intended operation is
   1844       add-with-carry or subtract-with-borrow. */
   1845    if (addSubCarry) {
   1846       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1847       vassert(keep);
   1848    }
   1849 
   1850    if (epartIsReg(rm)) {
   1851       /* Specially handle XOR reg,reg, because that doesn't really
   1852          depend on reg, and doing the obvious thing potentially
   1853          generates a spurious value check failure due to the bogus
   1854          dependency.  Ditto SBB reg,reg. */
   1855       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1856           && gregOfRM(rm) == eregOfRM(rm)) {
   1857          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1858       }
   1859       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1860       assign( src,  getIReg(size,eregOfRM(rm)) );
   1861 
   1862       if (addSubCarry && op8 == Iop_Add8) {
   1863          helper_ADC( size, dst1, dst0, src,
   1864                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1865          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1866       } else
   1867       if (addSubCarry && op8 == Iop_Sub8) {
   1868          helper_SBB( size, dst1, dst0, src,
   1869                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1870          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1871       } else {
   1872          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1873          if (isAddSub(op8))
   1874             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1875          else
   1876             setFlags_DEP1(op8, dst1, ty);
   1877          if (keep)
   1878             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1879       }
   1880 
   1881       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1882                           nameIReg(size,eregOfRM(rm)),
   1883                           nameIReg(size,gregOfRM(rm)));
   1884       return 1+delta0;
   1885    } else {
   1886       /* E refers to memory */
   1887       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1888       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1889       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1890 
   1891       if (addSubCarry && op8 == Iop_Add8) {
   1892          helper_ADC( size, dst1, dst0, src,
   1893                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1894          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1895       } else
   1896       if (addSubCarry && op8 == Iop_Sub8) {
   1897          helper_SBB( size, dst1, dst0, src,
   1898                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1899          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1900       } else {
   1901          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1902          if (isAddSub(op8))
   1903             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1904          else
   1905             setFlags_DEP1(op8, dst1, ty);
   1906          if (keep)
   1907             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1908       }
   1909 
   1910       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1911                           dis_buf,nameIReg(size,gregOfRM(rm)));
   1912       return len+delta0;
   1913    }
   1914 }
   1915 
   1916 
   1917 
   1918 /* Handle binary integer instructions of the form
   1919       op G, E  meaning
   1920       op reg, reg-or-mem
   1921    Is passed the a ptr to the modRM byte, the actual operation, and the
   1922    data size.  Returns the address advanced completely over this
   1923    instruction.
   1924 
   1925    G(src) is reg.
   1926    E(dst) is reg-or-mem
   1927 
   1928    If E is reg, -->    GET %E,  tmp
   1929                        OP %G,   tmp
   1930                        PUT tmp, %E
   1931 
   1932    If E is mem, -->    (getAddr E) -> tmpa
   1933                        LD (tmpa), tmpv
   1934                        OP %G, tmpv
   1935                        ST tmpv, (tmpa)
   1936 */
   1937 static
   1938 UInt dis_op2_G_E ( UChar       sorb,
   1939                    Bool        locked,
   1940                    Bool        addSubCarry,
   1941                    IROp        op8,
   1942                    Bool        keep,
   1943                    Int         size,
   1944                    Int         delta0,
   1945                    const HChar* t_x86opc )
   1946 {
   1947    HChar   dis_buf[50];
   1948    Int     len;
   1949    IRType  ty   = szToITy(size);
   1950    IRTemp  dst1 = newTemp(ty);
   1951    IRTemp  src  = newTemp(ty);
   1952    IRTemp  dst0 = newTemp(ty);
   1953    UChar   rm   = getIByte(delta0);
   1954    IRTemp  addr = IRTemp_INVALID;
   1955 
   1956    /* addSubCarry == True indicates the intended operation is
   1957       add-with-carry or subtract-with-borrow. */
   1958    if (addSubCarry) {
   1959       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1960       vassert(keep);
   1961    }
   1962 
   1963    if (epartIsReg(rm)) {
   1964       /* Specially handle XOR reg,reg, because that doesn't really
   1965          depend on reg, and doing the obvious thing potentially
   1966          generates a spurious value check failure due to the bogus
   1967          dependency.  Ditto SBB reg,reg.*/
   1968       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1969           && gregOfRM(rm) == eregOfRM(rm)) {
   1970          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1971       }
   1972       assign(dst0, getIReg(size,eregOfRM(rm)));
   1973       assign(src,  getIReg(size,gregOfRM(rm)));
   1974 
   1975       if (addSubCarry && op8 == Iop_Add8) {
   1976          helper_ADC( size, dst1, dst0, src,
   1977                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1978          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1979       } else
   1980       if (addSubCarry && op8 == Iop_Sub8) {
   1981          helper_SBB( size, dst1, dst0, src,
   1982                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1983          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1984       } else {
   1985          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   1986          if (isAddSub(op8))
   1987             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1988          else
   1989             setFlags_DEP1(op8, dst1, ty);
   1990          if (keep)
   1991             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1992       }
   1993 
   1994       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1995                           nameIReg(size,gregOfRM(rm)),
   1996                           nameIReg(size,eregOfRM(rm)));
   1997       return 1+delta0;
   1998    }
   1999 
   2000    /* E refers to memory */
   2001    {
   2002       addr = disAMode ( &len, sorb, delta0, dis_buf);
   2003       assign(dst0, loadLE(ty,mkexpr(addr)));
   2004       assign(src,  getIReg(size,gregOfRM(rm)));
   2005 
   2006       if (addSubCarry && op8 == Iop_Add8) {
   2007          if (locked) {
   2008             /* cas-style store */
   2009             helper_ADC( size, dst1, dst0, src,
   2010                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2011          } else {
   2012             /* normal store */
   2013             helper_ADC( size, dst1, dst0, src,
   2014                         /*store*/addr, IRTemp_INVALID, 0 );
   2015          }
   2016       } else
   2017       if (addSubCarry && op8 == Iop_Sub8) {
   2018          if (locked) {
   2019             /* cas-style store */
   2020             helper_SBB( size, dst1, dst0, src,
   2021                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2022          } else {
   2023             /* normal store */
   2024             helper_SBB( size, dst1, dst0, src,
   2025                         /*store*/addr, IRTemp_INVALID, 0 );
   2026          }
   2027       } else {
   2028          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2029          if (keep) {
   2030             if (locked) {
   2031                if (0) vex_printf("locked case\n" );
   2032                casLE( mkexpr(addr),
   2033                       mkexpr(dst0)/*expval*/,
   2034                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2035             } else {
   2036                if (0) vex_printf("nonlocked case\n");
   2037                storeLE(mkexpr(addr), mkexpr(dst1));
   2038             }
   2039          }
   2040          if (isAddSub(op8))
   2041             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2042          else
   2043             setFlags_DEP1(op8, dst1, ty);
   2044       }
   2045 
   2046       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2047                           nameIReg(size,gregOfRM(rm)), dis_buf);
   2048       return len+delta0;
   2049    }
   2050 }
   2051 
   2052 
   2053 /* Handle move instructions of the form
   2054       mov E, G  meaning
   2055       mov reg-or-mem, reg
   2056    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2057    the address advanced completely over this instruction.
   2058 
   2059    E(src) is reg-or-mem
   2060    G(dst) is reg.
   2061 
   2062    If E is reg, -->    GET %E,  tmpv
   2063                        PUT tmpv, %G
   2064 
   2065    If E is mem  -->    (getAddr E) -> tmpa
   2066                        LD (tmpa), tmpb
   2067                        PUT tmpb, %G
   2068 */
   2069 static
   2070 UInt dis_mov_E_G ( UChar       sorb,
   2071                    Int         size,
   2072                    Int         delta0 )
   2073 {
   2074    Int len;
   2075    UChar rm = getIByte(delta0);
   2076    HChar dis_buf[50];
   2077 
   2078    if (epartIsReg(rm)) {
   2079       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2080       DIP("mov%c %s,%s\n", nameISize(size),
   2081                            nameIReg(size,eregOfRM(rm)),
   2082                            nameIReg(size,gregOfRM(rm)));
   2083       return 1+delta0;
   2084    }
   2085 
   2086    /* E refers to memory */
   2087    {
   2088       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2089       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2090       DIP("mov%c %s,%s\n", nameISize(size),
   2091                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2092       return delta0+len;
   2093    }
   2094 }
   2095 
   2096 
   2097 /* Handle move instructions of the form
   2098       mov G, E  meaning
   2099       mov reg, reg-or-mem
   2100    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2101    the address advanced completely over this instruction.
   2102 
   2103    G(src) is reg.
   2104    E(dst) is reg-or-mem
   2105 
   2106    If E is reg, -->    GET %G,  tmp
   2107                        PUT tmp, %E
   2108 
   2109    If E is mem, -->    (getAddr E) -> tmpa
   2110                        GET %G, tmpv
   2111                        ST tmpv, (tmpa)
   2112 */
   2113 static
   2114 UInt dis_mov_G_E ( UChar       sorb,
   2115                    Int         size,
   2116                    Int         delta0 )
   2117 {
   2118    Int len;
   2119    UChar rm = getIByte(delta0);
   2120    HChar dis_buf[50];
   2121 
   2122    if (epartIsReg(rm)) {
   2123       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2124       DIP("mov%c %s,%s\n", nameISize(size),
   2125                            nameIReg(size,gregOfRM(rm)),
   2126                            nameIReg(size,eregOfRM(rm)));
   2127       return 1+delta0;
   2128    }
   2129 
   2130    /* E refers to memory */
   2131    {
   2132       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2133       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2134       DIP("mov%c %s,%s\n", nameISize(size),
   2135                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2136       return len+delta0;
   2137    }
   2138 }
   2139 
   2140 
   2141 /* op $immediate, AL/AX/EAX. */
   2142 static
   2143 UInt dis_op_imm_A ( Int    size,
   2144                     Bool   carrying,
   2145                     IROp   op8,
   2146                     Bool   keep,
   2147                     Int    delta,
   2148                     const HChar* t_x86opc )
   2149 {
   2150    IRType ty   = szToITy(size);
   2151    IRTemp dst0 = newTemp(ty);
   2152    IRTemp src  = newTemp(ty);
   2153    IRTemp dst1 = newTemp(ty);
   2154    UInt lit    = getUDisp(size,delta);
   2155    assign(dst0, getIReg(size,R_EAX));
   2156    assign(src,  mkU(ty,lit));
   2157 
   2158    if (isAddSub(op8) && !carrying) {
   2159       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2160       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2161    }
   2162    else
   2163    if (isLogic(op8)) {
   2164       vassert(!carrying);
   2165       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2166       setFlags_DEP1(op8, dst1, ty);
   2167    }
   2168    else
   2169    if (op8 == Iop_Add8 && carrying) {
   2170       helper_ADC( size, dst1, dst0, src,
   2171                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2172    }
   2173    else
   2174    if (op8 == Iop_Sub8 && carrying) {
   2175       helper_SBB( size, dst1, dst0, src,
   2176                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2177    }
   2178    else
   2179       vpanic("dis_op_imm_A(x86,guest)");
   2180 
   2181    if (keep)
   2182       putIReg(size, R_EAX, mkexpr(dst1));
   2183 
   2184    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2185                            lit, nameIReg(size,R_EAX));
   2186    return delta+size;
   2187 }
   2188 
   2189 
   2190 /* Sign- and Zero-extending moves. */
   2191 static
   2192 UInt dis_movx_E_G ( UChar      sorb,
   2193                     Int delta, Int szs, Int szd, Bool sign_extend )
   2194 {
   2195    UChar rm = getIByte(delta);
   2196    if (epartIsReg(rm)) {
   2197       if (szd == szs) {
   2198          // mutant case.  See #250799
   2199          putIReg(szd, gregOfRM(rm),
   2200                            getIReg(szs,eregOfRM(rm)));
   2201       } else {
   2202          // normal case
   2203          putIReg(szd, gregOfRM(rm),
   2204                       unop(mkWidenOp(szs,szd,sign_extend),
   2205                            getIReg(szs,eregOfRM(rm))));
   2206       }
   2207       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2208                                nameISize(szs), nameISize(szd),
   2209                                nameIReg(szs,eregOfRM(rm)),
   2210                                nameIReg(szd,gregOfRM(rm)));
   2211       return 1+delta;
   2212    }
   2213 
   2214    /* E refers to memory */
   2215    {
   2216       Int    len;
   2217       HChar  dis_buf[50];
   2218       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
   2219       if (szd == szs) {
   2220          // mutant case.  See #250799
   2221          putIReg(szd, gregOfRM(rm),
   2222                            loadLE(szToITy(szs),mkexpr(addr)));
   2223       } else {
   2224          // normal case
   2225          putIReg(szd, gregOfRM(rm),
   2226                       unop(mkWidenOp(szs,szd,sign_extend),
   2227                            loadLE(szToITy(szs),mkexpr(addr))));
   2228       }
   2229       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2230                                nameISize(szs), nameISize(szd),
   2231                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2232       return len+delta;
   2233    }
   2234 }
   2235 
   2236 
   2237 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2238    16 / 8 bit quantity in the given IRTemp.  */
   2239 static
   2240 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2241 {
   2242    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2243    IRTemp src64 = newTemp(Ity_I64);
   2244    IRTemp dst64 = newTemp(Ity_I64);
   2245    switch (sz) {
   2246       case 4:
   2247          assign( src64, binop(Iop_32HLto64,
   2248                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2249          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2250          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2251          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2252          break;
   2253       case 2: {
   2254          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2255          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2256          assign( src64, unop(widen3264,
   2257                              binop(Iop_16HLto32,
   2258                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2259          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2260          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2261          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2262          break;
   2263       }
   2264       case 1: {
   2265          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2266          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2267          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2268          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2269          assign( dst64,
   2270                  binop(op, mkexpr(src64),
   2271                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2272          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2273                            unop(Iop_64to32,mkexpr(dst64)))) );
   2274          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2275                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2276          break;
   2277       }
   2278       default: vpanic("codegen_div(x86)");
   2279    }
   2280 }
   2281 
   2282 
   2283 static
   2284 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2285                 Int delta, UChar modrm,
   2286                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2287 {
   2288    Int     len;
   2289    HChar   dis_buf[50];
   2290    IRType  ty   = szToITy(sz);
   2291    IRTemp  dst1 = newTemp(ty);
   2292    IRTemp  src  = newTemp(ty);
   2293    IRTemp  dst0 = newTemp(ty);
   2294    IRTemp  addr = IRTemp_INVALID;
   2295    IROp    op8  = Iop_INVALID;
   2296    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2297 
   2298    switch (gregOfRM(modrm)) {
   2299       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2300       case 2: break;  // ADC
   2301       case 3: break;  // SBB
   2302       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2303       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2304       /*NOTREACHED*/
   2305       default: vpanic("dis_Grp1: unhandled case");
   2306    }
   2307 
   2308    if (epartIsReg(modrm)) {
   2309       vassert(am_sz == 1);
   2310 
   2311       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2312       assign(src,  mkU(ty,d32 & mask));
   2313 
   2314       if (gregOfRM(modrm) == 2 /* ADC */) {
   2315          helper_ADC( sz, dst1, dst0, src,
   2316                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2317       } else
   2318       if (gregOfRM(modrm) == 3 /* SBB */) {
   2319          helper_SBB( sz, dst1, dst0, src,
   2320                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2321       } else {
   2322          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2323          if (isAddSub(op8))
   2324             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2325          else
   2326             setFlags_DEP1(op8, dst1, ty);
   2327       }
   2328 
   2329       if (gregOfRM(modrm) < 7)
   2330          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2331 
   2332       delta += (am_sz + d_sz);
   2333       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2334                               nameIReg(sz,eregOfRM(modrm)));
   2335    } else {
   2336       addr = disAMode ( &len, sorb, delta, dis_buf);
   2337 
   2338       assign(dst0, loadLE(ty,mkexpr(addr)));
   2339       assign(src, mkU(ty,d32 & mask));
   2340 
   2341       if (gregOfRM(modrm) == 2 /* ADC */) {
   2342          if (locked) {
   2343             /* cas-style store */
   2344             helper_ADC( sz, dst1, dst0, src,
   2345                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2346          } else {
   2347             /* normal store */
   2348             helper_ADC( sz, dst1, dst0, src,
   2349                         /*store*/addr, IRTemp_INVALID, 0 );
   2350          }
   2351       } else
   2352       if (gregOfRM(modrm) == 3 /* SBB */) {
   2353          if (locked) {
   2354             /* cas-style store */
   2355             helper_SBB( sz, dst1, dst0, src,
   2356                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2357          } else {
   2358             /* normal store */
   2359             helper_SBB( sz, dst1, dst0, src,
   2360                         /*store*/addr, IRTemp_INVALID, 0 );
   2361          }
   2362       } else {
   2363          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2364          if (gregOfRM(modrm) < 7) {
   2365             if (locked) {
   2366                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2367                                     mkexpr(dst1)/*newVal*/,
   2368                                     guest_EIP_curr_instr );
   2369             } else {
   2370                storeLE(mkexpr(addr), mkexpr(dst1));
   2371             }
   2372          }
   2373          if (isAddSub(op8))
   2374             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2375          else
   2376             setFlags_DEP1(op8, dst1, ty);
   2377       }
   2378 
   2379       delta += (len+d_sz);
   2380       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2381                               d32, dis_buf);
   2382    }
   2383    return delta;
   2384 }
   2385 
   2386 
   2387 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   2388    expression. */
   2389 
   2390 static
   2391 UInt dis_Grp2 ( UChar sorb,
   2392                 Int delta, UChar modrm,
   2393                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   2394                 const HChar* shift_expr_txt, Bool* decode_OK )
   2395 {
   2396    /* delta on entry points at the modrm byte. */
   2397    HChar  dis_buf[50];
   2398    Int    len;
   2399    Bool   isShift, isRotate, isRotateC;
   2400    IRType ty    = szToITy(sz);
   2401    IRTemp dst0  = newTemp(ty);
   2402    IRTemp dst1  = newTemp(ty);
   2403    IRTemp addr  = IRTemp_INVALID;
   2404 
   2405    *decode_OK = True;
   2406 
   2407    vassert(sz == 1 || sz == 2 || sz == 4);
   2408 
   2409    /* Put value to shift/rotate in dst0. */
   2410    if (epartIsReg(modrm)) {
   2411       assign(dst0, getIReg(sz, eregOfRM(modrm)));
   2412       delta += (am_sz + d_sz);
   2413    } else {
   2414       addr = disAMode ( &len, sorb, delta, dis_buf);
   2415       assign(dst0, loadLE(ty,mkexpr(addr)));
   2416       delta += len + d_sz;
   2417    }
   2418 
   2419    isShift = False;
   2420    switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   2421 
   2422    isRotate = False;
   2423    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
   2424 
   2425    isRotateC = False;
   2426    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
   2427 
   2428    if (!isShift && !isRotate && !isRotateC) {
   2429       /*NOTREACHED*/
   2430       vpanic("dis_Grp2(Reg): unhandled case(x86)");
   2431    }
   2432 
   2433    if (isRotateC) {
   2434       /* call a helper; these insns are so ridiculous they do not
   2435          deserve better */
   2436       Bool     left = toBool(gregOfRM(modrm) == 2);
   2437       IRTemp   r64  = newTemp(Ity_I64);
   2438       IRExpr** args
   2439          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
   2440                           widenUto32(shift_expr),   /* rotate amount */
   2441                           widenUto32(mk_x86g_calculate_eflags_all()),
   2442                           mkU32(sz) );
   2443       assign( r64, mkIRExprCCall(
   2444                       Ity_I64,
   2445                       0/*regparm*/,
   2446                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
   2447                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
   2448                       args
   2449                    )
   2450             );
   2451       /* new eflags in hi half r64; new value in lo half r64 */
   2452       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
   2453       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2454       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
   2455       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2456       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2457          elimination of previous stores to this field work better. */
   2458       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2459    }
   2460 
   2461    if (isShift) {
   2462 
   2463       IRTemp pre32     = newTemp(Ity_I32);
   2464       IRTemp res32     = newTemp(Ity_I32);
   2465       IRTemp res32ss   = newTemp(Ity_I32);
   2466       IRTemp shift_amt = newTemp(Ity_I8);
   2467       IROp   op32;
   2468 
   2469       switch (gregOfRM(modrm)) {
   2470          case 4: op32 = Iop_Shl32; break;
   2471          case 5: op32 = Iop_Shr32; break;
   2472          case 6: op32 = Iop_Shl32; break;
   2473          case 7: op32 = Iop_Sar32; break;
   2474          /*NOTREACHED*/
   2475          default: vpanic("dis_Grp2:shift"); break;
   2476       }
   2477 
   2478       /* Widen the value to be shifted to 32 bits, do the shift, and
   2479          narrow back down.  This seems surprisingly long-winded, but
   2480          unfortunately the Intel semantics requires that 8/16-bit
   2481          shifts give defined results for shift values all the way up
   2482          to 31, and this seems the simplest way to do it.  It has the
   2483          advantage that the only IR level shifts generated are of 32
   2484          bit values, and the shift amount is guaranteed to be in the
   2485          range 0 .. 31, thereby observing the IR semantics requiring
   2486          all shift values to be in the range 0 .. 2^word_size-1. */
   2487 
   2488       /* shift_amt = shift_expr & 31, regardless of operation size */
   2489       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
   2490 
   2491       /* suitably widen the value to be shifted to 32 bits. */
   2492       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
   2493                                      : widenUto32(mkexpr(dst0)) );
   2494 
   2495       /* res32 = pre32 `shift` shift_amt */
   2496       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
   2497 
   2498       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
   2499       assign( res32ss,
   2500               binop(op32,
   2501                     mkexpr(pre32),
   2502                     binop(Iop_And8,
   2503                           binop(Iop_Sub8,
   2504                                 mkexpr(shift_amt), mkU8(1)),
   2505                           mkU8(31))) );
   2506 
   2507       /* Build the flags thunk. */
   2508       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
   2509 
   2510       /* Narrow the result back down. */
   2511       assign( dst1, narrowTo(ty, mkexpr(res32)) );
   2512 
   2513    } /* if (isShift) */
   2514 
   2515    else
   2516    if (isRotate) {
   2517       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   2518       Bool   left      = toBool(gregOfRM(modrm) == 0);
   2519       IRTemp rot_amt   = newTemp(Ity_I8);
   2520       IRTemp rot_amt32 = newTemp(Ity_I8);
   2521       IRTemp oldFlags  = newTemp(Ity_I32);
   2522 
   2523       /* rot_amt = shift_expr & mask */
   2524       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   2525          expressions never shift beyond the word size and thus remain
   2526          well defined. */
   2527       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
   2528 
   2529       if (ty == Ity_I32)
   2530          assign(rot_amt, mkexpr(rot_amt32));
   2531       else
   2532          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
   2533 
   2534       if (left) {
   2535 
   2536          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   2537          assign(dst1,
   2538             binop( mkSizedOp(ty,Iop_Or8),
   2539                    binop( mkSizedOp(ty,Iop_Shl8),
   2540                           mkexpr(dst0),
   2541                           mkexpr(rot_amt)
   2542                    ),
   2543                    binop( mkSizedOp(ty,Iop_Shr8),
   2544                           mkexpr(dst0),
   2545                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2546                    )
   2547             )
   2548          );
   2549          ccOp += X86G_CC_OP_ROLB;
   2550 
   2551       } else { /* right */
   2552 
   2553          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   2554          assign(dst1,
   2555             binop( mkSizedOp(ty,Iop_Or8),
   2556                    binop( mkSizedOp(ty,Iop_Shr8),
   2557                           mkexpr(dst0),
   2558                           mkexpr(rot_amt)
   2559                    ),
   2560                    binop( mkSizedOp(ty,Iop_Shl8),
   2561                           mkexpr(dst0),
   2562                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2563                    )
   2564             )
   2565          );
   2566          ccOp += X86G_CC_OP_RORB;
   2567 
   2568       }
   2569 
   2570       /* dst1 now holds the rotated value.  Build flag thunk.  We
   2571          need the resulting value for this, and the previous flags.
   2572          Except don't set it if the rotate count is zero. */
   2573 
   2574       assign(oldFlags, mk_x86g_calculate_eflags_all());
   2575 
   2576       /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
   2577       IRTemp rot_amt32b = newTemp(Ity_I1);
   2578       assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );
   2579 
   2580       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   2581       stmt( IRStmt_Put( OFFB_CC_OP,
   2582                         IRExpr_ITE( mkexpr(rot_amt32b),
   2583                                     mkU32(ccOp),
   2584                                     IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
   2585       stmt( IRStmt_Put( OFFB_CC_DEP1,
   2586                         IRExpr_ITE( mkexpr(rot_amt32b),
   2587                                     widenUto32(mkexpr(dst1)),
   2588                                     IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
   2589       stmt( IRStmt_Put( OFFB_CC_DEP2,
   2590                         IRExpr_ITE( mkexpr(rot_amt32b),
   2591                                     mkU32(0),
   2592                                     IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   2593       stmt( IRStmt_Put( OFFB_CC_NDEP,
   2594                         IRExpr_ITE( mkexpr(rot_amt32b),
   2595                                     mkexpr(oldFlags),
   2596                                     IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   2597    } /* if (isRotate) */
   2598 
   2599    /* Save result, and finish up. */
   2600    if (epartIsReg(modrm)) {
   2601       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2602       if (vex_traceflags & VEX_TRACE_FE) {
   2603          vex_printf("%s%c ",
   2604                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2605          if (shift_expr_txt)
   2606             vex_printf("%s", shift_expr_txt);
   2607          else
   2608             ppIRExpr(shift_expr);
   2609          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
   2610       }
   2611    } else {
   2612       storeLE(mkexpr(addr), mkexpr(dst1));
   2613       if (vex_traceflags & VEX_TRACE_FE) {
   2614          vex_printf("%s%c ",
   2615                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2616          if (shift_expr_txt)
   2617             vex_printf("%s", shift_expr_txt);
   2618          else
   2619             ppIRExpr(shift_expr);
   2620          vex_printf(", %s\n", dis_buf);
   2621       }
   2622    }
   2623    return delta;
   2624 }
   2625 
   2626 
   2627 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2628 static
   2629 UInt dis_Grp8_Imm ( UChar sorb,
   2630                     Bool locked,
   2631                     Int delta, UChar modrm,
   2632                     Int am_sz, Int sz, UInt src_val,
   2633                     Bool* decode_OK )
   2634 {
   2635    /* src_val denotes a d8.
   2636       And delta on entry points at the modrm byte. */
   2637 
   2638    IRType ty     = szToITy(sz);
   2639    IRTemp t2     = newTemp(Ity_I32);
   2640    IRTemp t2m    = newTemp(Ity_I32);
   2641    IRTemp t_addr = IRTemp_INVALID;
   2642    HChar  dis_buf[50];
   2643    UInt   mask;
   2644 
   2645    /* we're optimists :-) */
   2646    *decode_OK = True;
   2647 
   2648    /* Limit src_val -- the bit offset -- to something within a word.
   2649       The Intel docs say that literal offsets larger than a word are
   2650       masked in this way. */
   2651    switch (sz) {
   2652       case 2:  src_val &= 15; break;
   2653       case 4:  src_val &= 31; break;
   2654       default: *decode_OK = False; return delta;
   2655    }
   2656 
   2657    /* Invent a mask suitable for the operation. */
   2658    switch (gregOfRM(modrm)) {
   2659       case 4: /* BT */  mask = 0;               break;
   2660       case 5: /* BTS */ mask = 1 << src_val;    break;
   2661       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2662       case 7: /* BTC */ mask = 1 << src_val;    break;
   2663          /* If this needs to be extended, probably simplest to make a
   2664             new function to handle the other cases (0 .. 3).  The
   2665             Intel docs do however not indicate any use for 0 .. 3, so
   2666             we don't expect this to happen. */
   2667       default: *decode_OK = False; return delta;
   2668    }
   2669 
   2670    /* Fetch the value to be tested and modified into t2, which is
   2671       32-bits wide regardless of sz. */
   2672    if (epartIsReg(modrm)) {
   2673       vassert(am_sz == 1);
   2674       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2675       delta += (am_sz + 1);
   2676       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2677                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2678    } else {
   2679       Int len;
   2680       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2681       delta  += (len+1);
   2682       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2683       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2684                               src_val, dis_buf);
   2685    }
   2686 
   2687    /* Compute the new value into t2m, if non-BT. */
   2688    switch (gregOfRM(modrm)) {
   2689       case 4: /* BT */
   2690          break;
   2691       case 5: /* BTS */
   2692          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2693          break;
   2694       case 6: /* BTR */
   2695          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2696          break;
   2697       case 7: /* BTC */
   2698          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2699          break;
   2700       default:
   2701          /*NOTREACHED*/ /*the previous switch guards this*/
   2702          vassert(0);
   2703    }
   2704 
   2705    /* Write the result back, if non-BT.  If the CAS fails then we
   2706       side-exit from the trace at this point, and so the flag state is
   2707       not affected.  This is of course as required. */
   2708    if (gregOfRM(modrm) != 4 /* BT */) {
   2709       if (epartIsReg(modrm)) {
   2710          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2711       } else {
   2712          if (locked) {
   2713             casLE( mkexpr(t_addr),
   2714                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2715                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2716                    guest_EIP_curr_instr );
   2717          } else {
   2718             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2719          }
   2720       }
   2721    }
   2722 
   2723    /* Copy relevant bit from t2 into the carry flag. */
   2724    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2725    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2726    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2727    stmt( IRStmt_Put(
   2728             OFFB_CC_DEP1,
   2729             binop(Iop_And32,
   2730                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2731                   mkU32(1))
   2732        ));
   2733    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2734       elimination of previous stores to this field work better. */
   2735    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2736 
   2737    return delta;
   2738 }
   2739 
   2740 
   2741 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2742    value in EAX/AX/AL by the given IRTemp, and park the result in
   2743    EDX:EAX/DX:AX/AX.
   2744 */
   2745 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2746                                IRTemp tmp, const HChar* tmp_txt )
   2747 {
   2748    IRType ty = szToITy(sz);
   2749    IRTemp t1 = newTemp(ty);
   2750 
   2751    assign( t1, getIReg(sz, R_EAX) );
   2752 
   2753    switch (ty) {
   2754       case Ity_I32: {
   2755          IRTemp res64   = newTemp(Ity_I64);
   2756          IRTemp resHi   = newTemp(Ity_I32);
   2757          IRTemp resLo   = newTemp(Ity_I32);
   2758          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2759          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2760          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2761          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2762          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2763          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2764          putIReg(4, R_EDX, mkexpr(resHi));
   2765          putIReg(4, R_EAX, mkexpr(resLo));
   2766          break;
   2767       }
   2768       case Ity_I16: {
   2769          IRTemp res32   = newTemp(Ity_I32);
   2770          IRTemp resHi   = newTemp(Ity_I16);
   2771          IRTemp resLo   = newTemp(Ity_I16);
   2772          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2773          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2774          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2775          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2776          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2777          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2778          putIReg(2, R_EDX, mkexpr(resHi));
   2779          putIReg(2, R_EAX, mkexpr(resLo));
   2780          break;
   2781       }
   2782       case Ity_I8: {
   2783          IRTemp res16   = newTemp(Ity_I16);
   2784          IRTemp resHi   = newTemp(Ity_I8);
   2785          IRTemp resLo   = newTemp(Ity_I8);
   2786          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2787          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2788          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2789          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2790          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2791          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2792          putIReg(2, R_EAX, mkexpr(res16));
   2793          break;
   2794       }
   2795       default:
   2796          vpanic("codegen_mulL_A_D(x86)");
   2797    }
   2798    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2799 }
   2800 
   2801 
   2802 /* Group 3 extended opcodes. */
   2803 static
   2804 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2805 {
   2806    UInt    d32;
   2807    UChar   modrm;
   2808    HChar   dis_buf[50];
   2809    Int     len;
   2810    IRTemp  addr;
   2811    IRType  ty = szToITy(sz);
   2812    IRTemp  t1 = newTemp(ty);
   2813    IRTemp dst1, src, dst0;
   2814 
   2815    *decode_OK = True; /* may change this later */
   2816 
   2817    modrm = getIByte(delta);
   2818 
   2819    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2820       /* LOCK prefix only allowed with not and neg subopcodes */
   2821       *decode_OK = False;
   2822       return delta;
   2823    }
   2824 
   2825    if (epartIsReg(modrm)) {
   2826       switch (gregOfRM(modrm)) {
   2827          case 0: { /* TEST */
   2828             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2829             dst1 = newTemp(ty);
   2830             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2831                                getIReg(sz,eregOfRM(modrm)),
   2832                                mkU(ty,d32)));
   2833             setFlags_DEP1( Iop_And8, dst1, ty );
   2834             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2835                                       nameIReg(sz, eregOfRM(modrm)));
   2836             break;
   2837          }
   2838          case 1: /* UNDEFINED */
   2839            /* The Intel docs imply this insn is undefined and binutils
   2840               agrees.  Unfortunately Core 2 will run it (with who
   2841               knows what result?)  sandpile.org reckons it's an alias
   2842               for case 0.  We play safe. */
   2843            *decode_OK = False;
   2844            break;
   2845          case 2: /* NOT */
   2846             delta++;
   2847             putIReg(sz, eregOfRM(modrm),
   2848                         unop(mkSizedOp(ty,Iop_Not8),
   2849                              getIReg(sz, eregOfRM(modrm))));
   2850             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2851             break;
   2852          case 3: /* NEG */
   2853             delta++;
   2854             dst0 = newTemp(ty);
   2855             src  = newTemp(ty);
   2856             dst1 = newTemp(ty);
   2857             assign(dst0, mkU(ty,0));
   2858             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2859             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2860             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2861             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2862             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2863             break;
   2864          case 4: /* MUL (unsigned widening) */
   2865             delta++;
   2866             src = newTemp(ty);
   2867             assign(src, getIReg(sz,eregOfRM(modrm)));
   2868             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2869             break;
   2870          case 5: /* IMUL (signed widening) */
   2871             delta++;
   2872             src = newTemp(ty);
   2873             assign(src, getIReg(sz,eregOfRM(modrm)));
   2874             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2875             break;
   2876          case 6: /* DIV */
   2877             delta++;
   2878             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2879             codegen_div ( sz, t1, False );
   2880             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2881             break;
   2882          case 7: /* IDIV */
   2883             delta++;
   2884             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2885             codegen_div ( sz, t1, True );
   2886             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2887             break;
   2888          default:
   2889             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2890             vpanic("Grp3(x86)");
   2891       }
   2892    } else {
   2893       addr = disAMode ( &len, sorb, delta, dis_buf );
   2894       t1   = newTemp(ty);
   2895       delta += len;
   2896       assign(t1, loadLE(ty,mkexpr(addr)));
   2897       switch (gregOfRM(modrm)) {
   2898          case 0: { /* TEST */
   2899             d32 = getUDisp(sz, delta); delta += sz;
   2900             dst1 = newTemp(ty);
   2901             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2902                                mkexpr(t1), mkU(ty,d32)));
   2903             setFlags_DEP1( Iop_And8, dst1, ty );
   2904             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2905             break;
   2906          }
   2907          case 1: /* UNDEFINED */
   2908            /* See comment above on R case */
   2909            *decode_OK = False;
   2910            break;
   2911          case 2: /* NOT */
   2912             dst1 = newTemp(ty);
   2913             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2914             if (locked) {
   2915                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2916                                     guest_EIP_curr_instr );
   2917             } else {
   2918                storeLE( mkexpr(addr), mkexpr(dst1) );
   2919             }
   2920             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2921             break;
   2922          case 3: /* NEG */
   2923             dst0 = newTemp(ty);
   2924             src  = newTemp(ty);
   2925             dst1 = newTemp(ty);
   2926             assign(dst0, mkU(ty,0));
   2927             assign(src,  mkexpr(t1));
   2928             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2929                                mkexpr(dst0), mkexpr(src)));
   2930             if (locked) {
   2931                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2932                                     guest_EIP_curr_instr );
   2933             } else {
   2934                storeLE( mkexpr(addr), mkexpr(dst1) );
   2935             }
   2936             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2937             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2938             break;
   2939          case 4: /* MUL */
   2940             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2941             break;
   2942          case 5: /* IMUL */
   2943             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2944             break;
   2945          case 6: /* DIV */
   2946             codegen_div ( sz, t1, False );
   2947             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2948             break;
   2949          case 7: /* IDIV */
   2950             codegen_div ( sz, t1, True );
   2951             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2952             break;
   2953          default:
   2954             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2955             vpanic("Grp3(x86)");
   2956       }
   2957    }
   2958    return delta;
   2959 }
   2960 
   2961 
   2962 /* Group 4 extended opcodes. */
   2963 static
   2964 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2965 {
   2966    Int   alen;
   2967    UChar modrm;
   2968    HChar dis_buf[50];
   2969    IRType ty = Ity_I8;
   2970    IRTemp t1 = newTemp(ty);
   2971    IRTemp t2 = newTemp(ty);
   2972 
   2973    *decode_OK = True;
   2974 
   2975    modrm = getIByte(delta);
   2976 
   2977    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2978       /* LOCK prefix only allowed with inc and dec subopcodes */
   2979       *decode_OK = False;
   2980       return delta;
   2981    }
   2982 
   2983    if (epartIsReg(modrm)) {
   2984       assign(t1, getIReg(1, eregOfRM(modrm)));
   2985       switch (gregOfRM(modrm)) {
   2986          case 0: /* INC */
   2987             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2988             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2989             setFlags_INC_DEC( True, t2, ty );
   2990             break;
   2991          case 1: /* DEC */
   2992             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2993             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2994             setFlags_INC_DEC( False, t2, ty );
   2995             break;
   2996          default:
   2997             *decode_OK = False;
   2998             return delta;
   2999       }
   3000       delta++;
   3001       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   3002                       nameIReg(1, eregOfRM(modrm)));
   3003    } else {
   3004       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   3005       assign( t1, loadLE(ty, mkexpr(addr)) );
   3006       switch (gregOfRM(modrm)) {
   3007          case 0: /* INC */
   3008             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   3009             if (locked) {
   3010                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3011                       guest_EIP_curr_instr );
   3012             } else {
   3013                storeLE( mkexpr(addr), mkexpr(t2) );
   3014             }
   3015             setFlags_INC_DEC( True, t2, ty );
   3016             break;
   3017          case 1: /* DEC */
   3018             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   3019             if (locked) {
   3020                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3021                       guest_EIP_curr_instr );
   3022             } else {
   3023                storeLE( mkexpr(addr), mkexpr(t2) );
   3024             }
   3025             setFlags_INC_DEC( False, t2, ty );
   3026             break;
   3027          default:
   3028             *decode_OK = False;
   3029             return delta;
   3030       }
   3031       delta += alen;
   3032       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3033    }
   3034    return delta;
   3035 }
   3036 
   3037 
   3038 /* Group 5 extended opcodes. */
   3039 static
   3040 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3041                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   3042 {
   3043    Int     len;
   3044    UChar   modrm;
   3045    HChar   dis_buf[50];
   3046    IRTemp  addr = IRTemp_INVALID;
   3047    IRType  ty = szToITy(sz);
   3048    IRTemp  t1 = newTemp(ty);
   3049    IRTemp  t2 = IRTemp_INVALID;
   3050 
   3051    *decode_OK = True;
   3052 
   3053    modrm = getIByte(delta);
   3054 
   3055    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3056       /* LOCK prefix only allowed with inc and dec subopcodes */
   3057       *decode_OK = False;
   3058       return delta;
   3059    }
   3060 
   3061    if (epartIsReg(modrm)) {
   3062       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3063       switch (gregOfRM(modrm)) {
   3064          case 0: /* INC */
   3065             vassert(sz == 2 || sz == 4);
   3066             t2 = newTemp(ty);
   3067             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3068                              mkexpr(t1), mkU(ty,1)));
   3069             setFlags_INC_DEC( True, t2, ty );
   3070             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3071             break;
   3072          case 1: /* DEC */
   3073             vassert(sz == 2 || sz == 4);
   3074             t2 = newTemp(ty);
   3075             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3076                              mkexpr(t1), mkU(ty,1)));
   3077             setFlags_INC_DEC( False, t2, ty );
   3078             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3079             break;
   3080          case 2: /* call Ev */
   3081             vassert(sz == 4);
   3082             t2 = newTemp(Ity_I32);
   3083             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3084             putIReg(4, R_ESP, mkexpr(t2));
   3085             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3086             jmp_treg(dres, Ijk_Call, t1);
   3087             vassert(dres->whatNext == Dis_StopHere);
   3088             break;
   3089          case 4: /* jmp Ev */
   3090             vassert(sz == 4);
   3091             jmp_treg(dres, Ijk_Boring, t1);
   3092             vassert(dres->whatNext == Dis_StopHere);
   3093             break;
   3094          case 6: /* PUSH Ev */
   3095             vassert(sz == 4 || sz == 2);
   3096             t2 = newTemp(Ity_I32);
   3097             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3098             putIReg(4, R_ESP, mkexpr(t2) );
   3099             storeLE( mkexpr(t2), mkexpr(t1) );
   3100             break;
   3101          default:
   3102             *decode_OK = False;
   3103             return delta;
   3104       }
   3105       delta++;
   3106       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3107                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3108    } else {
   3109       addr = disAMode ( &len, sorb, delta, dis_buf );
   3110       assign(t1, loadLE(ty,mkexpr(addr)));
   3111       switch (gregOfRM(modrm)) {
   3112          case 0: /* INC */
   3113             t2 = newTemp(ty);
   3114             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3115                              mkexpr(t1), mkU(ty,1)));
   3116             if (locked) {
   3117                casLE( mkexpr(addr),
   3118                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3119             } else {
   3120                storeLE(mkexpr(addr),mkexpr(t2));
   3121             }
   3122             setFlags_INC_DEC( True, t2, ty );
   3123             break;
   3124          case 1: /* DEC */
   3125             t2 = newTemp(ty);
   3126             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3127                              mkexpr(t1), mkU(ty,1)));
   3128             if (locked) {
   3129                casLE( mkexpr(addr),
   3130                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3131             } else {
   3132                storeLE(mkexpr(addr),mkexpr(t2));
   3133             }
   3134             setFlags_INC_DEC( False, t2, ty );
   3135             break;
   3136          case 2: /* call Ev */
   3137             vassert(sz == 4);
   3138             t2 = newTemp(Ity_I32);
   3139             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3140             putIReg(4, R_ESP, mkexpr(t2));
   3141             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3142             jmp_treg(dres, Ijk_Call, t1);
   3143             vassert(dres->whatNext == Dis_StopHere);
   3144             break;
   3145          case 4: /* JMP Ev */
   3146             vassert(sz == 4);
   3147             jmp_treg(dres, Ijk_Boring, t1);
   3148             vassert(dres->whatNext == Dis_StopHere);
   3149             break;
   3150          case 6: /* PUSH Ev */
   3151             vassert(sz == 4 || sz == 2);
   3152             t2 = newTemp(Ity_I32);
   3153             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3154             putIReg(4, R_ESP, mkexpr(t2) );
   3155             storeLE( mkexpr(t2), mkexpr(t1) );
   3156             break;
   3157          default:
   3158             *decode_OK = False;
   3159             return delta;
   3160       }
   3161       delta += len;
   3162       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3163                        nameISize(sz), dis_buf);
   3164    }
   3165    return delta;
   3166 }
   3167 
   3168 
   3169 /*------------------------------------------------------------*/
   3170 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3171 /*------------------------------------------------------------*/
   3172 
   3173 /* Code shared by all the string ops */
   3174 static
   3175 void dis_string_op_increment(Int sz, IRTemp t_inc)
   3176 {
   3177    if (sz == 4 || sz == 2) {
   3178       assign( t_inc,
   3179               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3180                                mkU8(sz/2) ) );
   3181    } else {
   3182       assign( t_inc,
   3183               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3184    }
   3185 }
   3186 
   3187 static
   3188 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3189                     Int sz, const HChar* name, UChar sorb )
   3190 {
   3191    IRTemp t_inc = newTemp(Ity_I32);
   3192    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3193    dis_string_op_increment(sz, t_inc);
   3194    dis_OP( sz, t_inc );
   3195    DIP("%s%c\n", name, nameISize(sz));
   3196 }
   3197 
   3198 static
   3199 void dis_MOVS ( Int sz, IRTemp t_inc )
   3200 {
   3201    IRType ty = szToITy(sz);
   3202    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3203    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3204 
   3205    assign( td, getIReg(4, R_EDI) );
   3206    assign( ts, getIReg(4, R_ESI) );
   3207 
   3208    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3209 
   3210    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3211    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3212 }
   3213 
   3214 static
   3215 void dis_LODS ( Int sz, IRTemp t_inc )
   3216 {
   3217    IRType ty = szToITy(sz);
   3218    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3219 
   3220    assign( ts, getIReg(4, R_ESI) );
   3221 
   3222    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3223 
   3224    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3225 }
   3226 
   3227 static
   3228 void dis_STOS ( Int sz, IRTemp t_inc )
   3229 {
   3230    IRType ty = szToITy(sz);
   3231    IRTemp ta = newTemp(ty);        /* EAX */
   3232    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3233 
   3234    assign( ta, getIReg(sz, R_EAX) );
   3235    assign( td, getIReg(4, R_EDI) );
   3236 
   3237    storeLE( mkexpr(td), mkexpr(ta) );
   3238 
   3239    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3240 }
   3241 
   3242 static
   3243 void dis_CMPS ( Int sz, IRTemp t_inc )
   3244 {
   3245    IRType ty  = szToITy(sz);
   3246    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3247    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3248    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3249    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3250 
   3251    assign( td, getIReg(4, R_EDI) );
   3252    assign( ts, getIReg(4, R_ESI) );
   3253 
   3254    assign( tdv, loadLE(ty,mkexpr(td)) );
   3255    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3256 
   3257    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3258 
   3259    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3260    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3261 }
   3262 
   3263 static
   3264 void dis_SCAS ( Int sz, IRTemp t_inc )
   3265 {
   3266    IRType ty  = szToITy(sz);
   3267    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3268    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3269    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3270 
   3271    assign( ta, getIReg(sz, R_EAX) );
   3272    assign( td, getIReg(4, R_EDI) );
   3273 
   3274    assign( tdv, loadLE(ty,mkexpr(td)) );
   3275    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3276 
   3277    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3278 }
   3279 
   3280 
   3281 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3282    We assume the insn is the last one in the basic block, and so emit a jump
   3283    to the next insn, rather than just falling through. */
   3284 static
   3285 void dis_REP_op ( /*MOD*/DisResult* dres,
   3286                   X86Condcode cond,
   3287                   void (*dis_OP)(Int, IRTemp),
   3288                   Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
   3289 {
   3290    IRTemp t_inc = newTemp(Ity_I32);
   3291    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3292 
   3293    assign( tc, getIReg(4,R_ECX) );
   3294 
   3295    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3296                       Ijk_Boring,
   3297                       IRConst_U32(eip_next), OFFB_EIP ) );
   3298 
   3299    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3300 
   3301    dis_string_op_increment(sz, t_inc);
   3302    dis_OP (sz, t_inc);
   3303 
   3304    if (cond == X86CondAlways) {
   3305       jmp_lit(dres, Ijk_Boring, eip);
   3306       vassert(dres->whatNext == Dis_StopHere);
   3307    } else {
   3308       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3309                          Ijk_Boring,
   3310                          IRConst_U32(eip), OFFB_EIP ) );
   3311       jmp_lit(dres, Ijk_Boring, eip_next);
   3312       vassert(dres->whatNext == Dis_StopHere);
   3313    }
   3314    DIP("%s%c\n", name, nameISize(sz));
   3315 }
   3316 
   3317 
   3318 /*------------------------------------------------------------*/
   3319 /*--- Arithmetic, etc.                                     ---*/
   3320 /*------------------------------------------------------------*/
   3321 
   3322 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3323 static
   3324 UInt dis_mul_E_G ( UChar       sorb,
   3325                    Int         size,
   3326                    Int         delta0 )
   3327 {
   3328    Int    alen;
   3329    HChar  dis_buf[50];
   3330    UChar  rm = getIByte(delta0);
   3331    IRType ty = szToITy(size);
   3332    IRTemp te = newTemp(ty);
   3333    IRTemp tg = newTemp(ty);
   3334    IRTemp resLo = newTemp(ty);
   3335 
   3336    assign( tg, getIReg(size, gregOfRM(rm)) );
   3337    if (epartIsReg(rm)) {
   3338       assign( te, getIReg(size, eregOfRM(rm)) );
   3339    } else {
   3340       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3341       assign( te, loadLE(ty,mkexpr(addr)) );
   3342    }
   3343 
   3344    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3345 
   3346    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3347 
   3348    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3349 
   3350    if (epartIsReg(rm)) {
   3351       DIP("imul%c %s, %s\n", nameISize(size),
   3352                              nameIReg(size,eregOfRM(rm)),
   3353                              nameIReg(size,gregOfRM(rm)));
   3354       return 1+delta0;
   3355    } else {
   3356       DIP("imul%c %s, %s\n", nameISize(size),
   3357                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3358       return alen+delta0;
   3359    }
   3360 }
   3361 
   3362 
   3363 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3364 static
   3365 UInt dis_imul_I_E_G ( UChar       sorb,
   3366                       Int         size,
   3367                       Int         delta,
   3368                       Int         litsize )
   3369 {
   3370    Int    d32, alen;
   3371    HChar  dis_buf[50];
   3372    UChar  rm = getIByte(delta);
   3373    IRType ty = szToITy(size);
   3374    IRTemp te = newTemp(ty);
   3375    IRTemp tl = newTemp(ty);
   3376    IRTemp resLo = newTemp(ty);
   3377 
   3378    vassert(size == 1 || size == 2 || size == 4);
   3379 
   3380    if (epartIsReg(rm)) {
   3381       assign(te, getIReg(size, eregOfRM(rm)));
   3382       delta++;
   3383    } else {
   3384       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3385       assign(te, loadLE(ty, mkexpr(addr)));
   3386       delta += alen;
   3387    }
   3388    d32 = getSDisp(litsize,delta);
   3389    delta += litsize;
   3390 
   3391    if (size == 1) d32 &= 0xFF;
   3392    if (size == 2) d32 &= 0xFFFF;
   3393 
   3394    assign(tl, mkU(ty,d32));
   3395 
   3396    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3397 
   3398    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3399 
   3400    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3401 
   3402    DIP("imul %d, %s, %s\n", d32,
   3403        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3404        nameIReg(size,gregOfRM(rm)) );
   3405    return delta;
   3406 }
   3407 
   3408 
   3409 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3410    the supplied IRTemp, and return a new IRTemp holding the result.
   3411    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3412    argument is zero, return the number of bits in the word (the
   3413    natural semantics). */
   3414 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3415 {
   3416    vassert(ty == Ity_I32 || ty == Ity_I16);
   3417 
   3418    IRTemp src32 = newTemp(Ity_I32);
   3419    assign(src32, widenUto32( mkexpr(src) ));
   3420 
   3421    IRTemp src32x = newTemp(Ity_I32);
   3422    assign(src32x,
   3423           binop(Iop_Shl32, mkexpr(src32),
   3424                            mkU8(32 - 8 * sizeofIRType(ty))));
   3425 
   3426    // Clz32 has undefined semantics when its input is zero, so
   3427    // special-case around that.
   3428    IRTemp res32 = newTemp(Ity_I32);
   3429    assign(res32,
   3430           IRExpr_ITE(
   3431              binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
   3432              mkU32(8 * sizeofIRType(ty)),
   3433              unop(Iop_Clz32, mkexpr(src32x))
   3434    ));
   3435 
   3436    IRTemp res = newTemp(ty);
   3437    assign(res, narrowTo(ty, mkexpr(res32)));
   3438    return res;
   3439 }
   3440 
   3441 
   3442 /*------------------------------------------------------------*/
   3443 /*---                                                      ---*/
   3444 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3445 /*---                                                      ---*/
   3446 /*------------------------------------------------------------*/
   3447 
   3448 /* --- Helper functions for dealing with the register stack. --- */
   3449 
   3450 /* --- Set the emulation-warning pseudo-register. --- */
   3451 
   3452 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3453 {
   3454    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3455    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   3456 }
   3457 
   3458 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3459 
   3460 static IRExpr* mkQNaN64 ( void )
   3461 {
   3462   /* QNaN is 0 2047 1 0(51times)
   3463      == 0b 11111111111b 1 0(51times)
   3464      == 0x7FF8 0000 0000 0000
   3465    */
   3466    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3467 }
   3468 
   3469 /* --------- Get/put the top-of-stack pointer. --------- */
   3470 
   3471 static IRExpr* get_ftop ( void )
   3472 {
   3473    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3474 }
   3475 
   3476 static void put_ftop ( IRExpr* e )
   3477 {
   3478    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3479    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3480 }
   3481 
   3482 /* --------- Get/put the C3210 bits. --------- */
   3483 
   3484 static IRExpr* get_C3210 ( void )
   3485 {
   3486    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3487 }
   3488 
   3489 static void put_C3210 ( IRExpr* e )
   3490 {
   3491    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3492 }
   3493 
   3494 /* --------- Get/put the FPU rounding mode. --------- */
   3495 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3496 {
   3497    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3498 }
   3499 
   3500 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3501 {
   3502    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3503 }
   3504 
   3505 
   3506 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3507 /* Produces a value in 0 .. 3, which is encoded as per the type
   3508    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3509    per IRRoundingMode, we merely need to get it and mask it for
   3510    safety.
   3511 */
   3512 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3513 {
   3514    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3515 }
   3516 
   3517 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3518 {
   3519    return mkU32(Irrm_NEAREST);
   3520 }
   3521 
   3522 
   3523 /* --------- Get/set FP register tag bytes. --------- */
   3524 
   3525 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3526 
   3527 static void put_ST_TAG ( Int i, IRExpr* value )
   3528 {
   3529    IRRegArray* descr;
   3530    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3531    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3532    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3533 }
   3534 
   3535 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3536    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3537 
   3538 static IRExpr* get_ST_TAG ( Int i )
   3539 {
   3540    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3541    return IRExpr_GetI( descr, get_ftop(), i );
   3542 }
   3543 
   3544 
   3545 /* --------- Get/set FP registers. --------- */
   3546 
   3547 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3548    register's tag to indicate the register is full.  The previous
   3549    state of the register is not checked. */
   3550 
   3551 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3552 {
   3553    IRRegArray* descr;
   3554    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3555    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3556    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3557    /* Mark the register as in-use. */
   3558    put_ST_TAG(i, mkU8(1));
   3559 }
   3560 
   3561 /* Given i, and some expression e, emit
   3562       ST(i) = is_full(i) ? NaN : e
   3563    and set the tag accordingly.
   3564 */
   3565 
   3566 static void put_ST ( Int i, IRExpr* value )
   3567 {
   3568    put_ST_UNCHECKED(
   3569       i,
   3570       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3571                   /* non-0 means full */
   3572                   mkQNaN64(),
   3573                   /* 0 means empty */
   3574                   value
   3575       )
   3576    );
   3577 }
   3578 
   3579 
   3580 /* Given i, generate an expression yielding 'ST(i)'. */
   3581 
   3582 static IRExpr* get_ST_UNCHECKED ( Int i )
   3583 {
   3584    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3585    return IRExpr_GetI( descr, get_ftop(), i );
   3586 }
   3587 
   3588 
   3589 /* Given i, generate an expression yielding
   3590   is_full(i) ? ST(i) : NaN
   3591 */
   3592 
   3593 static IRExpr* get_ST ( Int i )
   3594 {
   3595    return
   3596       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3597                   /* non-0 means full */
   3598                   get_ST_UNCHECKED(i),
   3599                   /* 0 means empty */
   3600                   mkQNaN64());
   3601 }
   3602 
   3603 
   3604 /* Given i, and some expression e, and a condition cond, generate IR
   3605    which has the same effect as put_ST(i,e) when cond is true and has
   3606    no effect when cond is false.  Given the lack of proper
   3607    if-then-else in the IR, this is pretty tricky.
   3608 */
   3609 
   3610 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   3611 {
   3612    // new_tag = if cond then FULL else old_tag
   3613    // new_val = if cond then (if old_tag==FULL then NaN else val)
   3614    //                   else old_val
   3615 
   3616    IRTemp old_tag = newTemp(Ity_I8);
   3617    assign(old_tag, get_ST_TAG(i));
   3618    IRTemp new_tag = newTemp(Ity_I8);
   3619    assign(new_tag,
   3620           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   3621 
   3622    IRTemp old_val = newTemp(Ity_F64);
   3623    assign(old_val, get_ST_UNCHECKED(i));
   3624    IRTemp new_val = newTemp(Ity_F64);
   3625    assign(new_val,
   3626           IRExpr_ITE(mkexpr(cond),
   3627                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   3628                                 /* non-0 means full */
   3629                                 mkQNaN64(),
   3630                                 /* 0 means empty */
   3631                                 value),
   3632                      mkexpr(old_val)));
   3633 
   3634    put_ST_UNCHECKED(i, mkexpr(new_val));
   3635    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   3636    // now set it to new_tag instead.
   3637    put_ST_TAG(i, mkexpr(new_tag));
   3638 }
   3639 
   3640 /* Adjust FTOP downwards by one register. */
   3641 
   3642 static void fp_push ( void )
   3643 {
   3644    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3645 }
   3646 
   3647 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   3648    don't change it. */
   3649 
   3650 static void maybe_fp_push ( IRTemp cond )
   3651 {
   3652    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   3653 }
   3654 
   3655 /* Adjust FTOP upwards by one register, and mark the vacated register
   3656    as empty.  */
   3657 
   3658 static void fp_pop ( void )
   3659 {
   3660    put_ST_TAG(0, mkU8(0));
   3661    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3662 }
   3663 
   3664 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   3665    e[31:1] == 0.
   3666 */
   3667 static void set_C2 ( IRExpr* e )
   3668 {
   3669    IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
   3670    put_C3210( binop(Iop_Or32,
   3671                     cleared,
   3672                     binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
   3673 }
   3674 
   3675 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   3676    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   3677    test is simple, but the derivation of it is not so simple.
   3678 
   3679    The exponent field for an IEEE754 double is 11 bits.  That means it
   3680    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   3681    the number is either a NaN or an Infinity and so is not finite.
   3682    Furthermore, a finite value of exactly 2^63 is the smallest value
   3683    that has exponent value 0x43E.  Hence, what we need to do is
   3684    extract the exponent, ignoring the sign bit and mantissa, and check
   3685    it is < 0x43E, or <= 0x43D.
   3686 
   3687    To make this easily applicable to 32- and 64-bit targets, a
   3688    roundabout approach is used.  First the number is converted to I64,
   3689    then the top 32 bits are taken.  Shifting them right by 20 bits
   3690    places the sign bit and exponent in the bottom 12 bits.  Anding
   3691    with 0x7FF gets rid of the sign bit, leaving just the exponent
   3692    available for comparison.
   3693 */
   3694 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   3695 {
   3696    IRTemp i64 = newTemp(Ity_I64);
   3697    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   3698    IRTemp exponent = newTemp(Ity_I32);
   3699    assign(exponent,
   3700           binop(Iop_And32,
   3701                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   3702                 mkU32(0x7FF)));
   3703    IRTemp in_range_and_finite = newTemp(Ity_I1);
   3704    assign(in_range_and_finite,
   3705           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   3706    return in_range_and_finite;
   3707 }
   3708 
   3709 /* Invent a plausible-looking FPU status word value:
   3710       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3711  */
   3712 static IRExpr* get_FPU_sw ( void )
   3713 {
   3714    return
   3715       unop(Iop_32to16,
   3716            binop(Iop_Or32,
   3717                  binop(Iop_Shl32,
   3718                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3719                              mkU8(11)),
   3720                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3721       ));
   3722 }
   3723 
   3724 
   3725 /* ------------------------------------------------------- */
   3726 /* Given all that stack-mangling junk, we can now go ahead
   3727    and describe FP instructions.
   3728 */
   3729 
   3730 /* ST(0) = ST(0) `op` mem64/32(addr)
   3731    Need to check ST(0)'s tag on read, but not on write.
   3732 */
   3733 static
   3734 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3735                          IROp op, Bool dbl )
   3736 {
   3737    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3738    if (dbl) {
   3739       put_ST_UNCHECKED(0,
   3740          triop( op,
   3741                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3742                 get_ST(0),
   3743                 loadLE(Ity_F64,mkexpr(addr))
   3744          ));
   3745    } else {
   3746       put_ST_UNCHECKED(0,
   3747          triop( op,
   3748                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3749                 get_ST(0),
   3750                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3751          ));
   3752    }
   3753 }
   3754 
   3755 
   3756 /* ST(0) = mem64/32(addr) `op` ST(0)
   3757    Need to check ST(0)'s tag on read, but not on write.
   3758 */
   3759 static
   3760 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3761                             IROp op, Bool dbl )
   3762 {
   3763    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3764    if (dbl) {
   3765       put_ST_UNCHECKED(0,
   3766          triop( op,
   3767                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3768                 loadLE(Ity_F64,mkexpr(addr)),
   3769                 get_ST(0)
   3770          ));
   3771    } else {
   3772       put_ST_UNCHECKED(0,
   3773          triop( op,
   3774                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3775                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3776                 get_ST(0)
   3777          ));
   3778    }
   3779 }
   3780 
   3781 
   3782 /* ST(dst) = ST(dst) `op` ST(src).
   3783    Check dst and src tags when reading but not on write.
   3784 */
   3785 static
   3786 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3787                       Bool pop_after )
   3788 {
   3789    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
   3790                                  st_src, st_dst);
   3791    put_ST_UNCHECKED(
   3792       st_dst,
   3793       triop( op,
   3794              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3795              get_ST(st_dst),
   3796              get_ST(st_src) )
   3797    );
   3798    if (pop_after)
   3799       fp_pop();
   3800 }
   3801 
   3802 /* ST(dst) = ST(src) `op` ST(dst).
   3803    Check dst and src tags when reading but not on write.
   3804 */
   3805 static
   3806 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
   3807                          UInt st_dst, Bool pop_after )
   3808 {
   3809    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"",
   3810                                  st_src, st_dst);
   3811    put_ST_UNCHECKED(
   3812       st_dst,
   3813       triop( op,
   3814              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3815              get_ST(st_src),
   3816              get_ST(st_dst) )
   3817    );
   3818    if (pop_after)
   3819       fp_pop();
   3820 }
   3821 
   3822 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3823 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3824 {
   3825    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   3826    /* This is a bit of a hack (and isn't really right).  It sets
   3827       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3828       documentation implies A and S are unchanged.
   3829    */
   3830    /* It's also fishy in that it is used both for COMIP and
   3831       UCOMIP, and they aren't the same (although similar). */
   3832    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3833    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3834    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3835                      binop( Iop_And32,
   3836                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3837                             mkU32(0x45)
   3838        )));
   3839    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3840       elimination of previous stores to this field work better. */
   3841    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3842    if (pop_after)
   3843       fp_pop();
   3844 }
   3845 
   3846 
   3847 static
   3848 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3849 {
   3850    Int    len;
   3851    UInt   r_src, r_dst;
   3852    HChar  dis_buf[50];
   3853    IRTemp t1, t2;
   3854 
   3855    /* On entry, delta points at the second byte of the insn (the modrm
   3856       byte).*/
   3857    UChar first_opcode = getIByte(delta-1);
   3858    UChar modrm        = getIByte(delta+0);
   3859 
   3860    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3861 
   3862    if (first_opcode == 0xD8) {
   3863       if (modrm < 0xC0) {
   3864 
   3865          /* bits 5,4,3 are an opcode extension, and the modRM also
   3866            specifies an address. */
   3867          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3868          delta += len;
   3869 
   3870          switch (gregOfRM(modrm)) {
   3871 
   3872             case 0: /* FADD single-real */
   3873                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3874                break;
   3875 
   3876             case 1: /* FMUL single-real */
   3877                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3878                break;
   3879 
   3880             case 2: /* FCOM single-real */
   3881                DIP("fcoms %s\n", dis_buf);
   3882                /* This forces C1 to zero, which isn't right. */
   3883                put_C3210(
   3884                    binop( Iop_And32,
   3885                           binop(Iop_Shl32,
   3886                                 binop(Iop_CmpF64,
   3887                                       get_ST(0),
   3888                                       unop(Iop_F32toF64,
   3889                                            loadLE(Ity_F32,mkexpr(addr)))),
   3890                                 mkU8(8)),
   3891                           mkU32(0x4500)
   3892                    ));
   3893                break;
   3894 
   3895             case 3: /* FCOMP single-real */
   3896                DIP("fcomps %s\n", dis_buf);
   3897                /* This forces C1 to zero, which isn't right. */
   3898                put_C3210(
   3899                    binop( Iop_And32,
   3900                           binop(Iop_Shl32,
   3901                                 binop(Iop_CmpF64,
   3902                                       get_ST(0),
   3903                                       unop(Iop_F32toF64,
   3904                                            loadLE(Ity_F32,mkexpr(addr)))),
   3905                                 mkU8(8)),
   3906                           mkU32(0x4500)
   3907                    ));
   3908                fp_pop();
   3909                break;
   3910 
   3911             case 4: /* FSUB single-real */
   3912                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3913                break;
   3914 
   3915             case 5: /* FSUBR single-real */
   3916                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3917                break;
   3918 
   3919             case 6: /* FDIV single-real */
   3920                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3921                break;
   3922 
   3923             case 7: /* FDIVR single-real */
   3924                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3925                break;
   3926 
   3927             default:
   3928                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   3929                vex_printf("first_opcode == 0xD8\n");
   3930                goto decode_fail;
   3931          }
   3932       } else {
   3933          delta++;
   3934          switch (modrm) {
   3935 
   3936             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3937                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3938                break;
   3939 
   3940             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3941                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3942                break;
   3943 
   3944             /* Dunno if this is right */
   3945             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3946                r_dst = (UInt)modrm - 0xD0;
   3947                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
   3948                /* This forces C1 to zero, which isn't right. */
   3949                put_C3210(
   3950                    binop( Iop_And32,
   3951                           binop(Iop_Shl32,
   3952                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3953                                 mkU8(8)),
   3954                           mkU32(0x4500)
   3955                    ));
   3956                break;
   3957 
   3958             /* Dunno if this is right */
   3959             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3960                r_dst = (UInt)modrm - 0xD8;
   3961                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
   3962                /* This forces C1 to zero, which isn't right. */
   3963                put_C3210(
   3964                    binop( Iop_And32,
   3965                           binop(Iop_Shl32,
   3966                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3967                                 mkU8(8)),
   3968                           mkU32(0x4500)
   3969                    ));
   3970                fp_pop();
   3971                break;
   3972 
   3973             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3974                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3975                break;
   3976 
   3977             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3978                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3979                break;
   3980 
   3981             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3982                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3983                break;
   3984 
   3985             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3986                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3987                break;
   3988 
   3989             default:
   3990                goto decode_fail;
   3991          }
   3992       }
   3993    }
   3994 
   3995    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3996    else
   3997    if (first_opcode == 0xD9) {
   3998       if (modrm < 0xC0) {
   3999 
   4000          /* bits 5,4,3 are an opcode extension, and the modRM also
   4001             specifies an address. */
   4002          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4003          delta += len;
   4004 
   4005          switch (gregOfRM(modrm)) {
   4006 
   4007             case 0: /* FLD single-real */
   4008                DIP("flds %s\n", dis_buf);
   4009                fp_push();
   4010                put_ST(0, unop(Iop_F32toF64,
   4011                               loadLE(Ity_F32, mkexpr(addr))));
   4012                break;
   4013 
   4014             case 2: /* FST single-real */
   4015                DIP("fsts %s\n", dis_buf);
   4016                storeLE(mkexpr(addr),
   4017                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4018                break;
   4019 
   4020             case 3: /* FSTP single-real */
   4021                DIP("fstps %s\n", dis_buf);
   4022                storeLE(mkexpr(addr),
   4023                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4024                fp_pop();
   4025                break;
   4026 
   4027             case 4: { /* FLDENV m28 */
   4028                /* Uses dirty helper:
   4029                      VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
   4030                IRTemp   ew = newTemp(Ity_I32);
   4031                IRDirty* d  = unsafeIRDirty_0_N (
   4032                                 0/*regparms*/,
   4033                                 "x86g_dirtyhelper_FLDENV",
   4034                                 &x86g_dirtyhelper_FLDENV,
   4035                                 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   4036                              );
   4037                d->tmp   = ew;
   4038                /* declare we're reading memory */
   4039                d->mFx   = Ifx_Read;
   4040                d->mAddr = mkexpr(addr);
   4041                d->mSize = 28;
   4042 
   4043                /* declare we're writing guest state */
   4044                d->nFxState = 4;
   4045                vex_bzero(&d->fxState, sizeof(d->fxState));
   4046 
   4047                d->fxState[0].fx     = Ifx_Write;
   4048                d->fxState[0].offset = OFFB_FTOP;
   4049                d->fxState[0].size   = sizeof(UInt);
   4050 
   4051                d->fxState[1].fx     = Ifx_Write;
   4052                d->fxState[1].offset = OFFB_FPTAGS;
   4053                d->fxState[1].size   = 8 * sizeof(UChar);
   4054 
   4055                d->fxState[2].fx     = Ifx_Write;
   4056                d->fxState[2].offset = OFFB_FPROUND;
   4057                d->fxState[2].size   = sizeof(UInt);
   4058 
   4059                d->fxState[3].fx     = Ifx_Write;
   4060                d->fxState[3].offset = OFFB_FC3210;
   4061                d->fxState[3].size   = sizeof(UInt);
   4062 
   4063                stmt( IRStmt_Dirty(d) );
   4064 
   4065                /* ew contains any emulation warning we may need to
   4066                   issue.  If needed, side-exit to the next insn,
   4067                   reporting the warning, so that Valgrind's dispatcher
   4068                   sees the warning. */
   4069                put_emwarn( mkexpr(ew) );
   4070                stmt(
   4071                   IRStmt_Exit(
   4072                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4073                      Ijk_EmWarn,
   4074                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4075                      OFFB_EIP
   4076                   )
   4077                );
   4078 
   4079                DIP("fldenv %s\n", dis_buf);
   4080                break;
   4081             }
   4082 
   4083             case 5: {/* FLDCW */
   4084                /* The only thing we observe in the control word is the
   4085                   rounding mode.  Therefore, pass the 16-bit value
   4086                   (x87 native-format control word) to a clean helper,
   4087                   getting back a 64-bit value, the lower half of which
   4088                   is the FPROUND value to store, and the upper half of
   4089                   which is the emulation-warning token which may be
   4090                   generated.
   4091                */
   4092                /* ULong x86g_check_fldcw ( UInt ); */
   4093                IRTemp t64 = newTemp(Ity_I64);
   4094                IRTemp ew = newTemp(Ity_I32);
   4095                DIP("fldcw %s\n", dis_buf);
   4096                assign( t64, mkIRExprCCall(
   4097                                Ity_I64, 0/*regparms*/,
   4098                                "x86g_check_fldcw",
   4099                                &x86g_check_fldcw,
   4100                                mkIRExprVec_1(
   4101                                   unop( Iop_16Uto32,
   4102                                         loadLE(Ity_I16, mkexpr(addr)))
   4103                                )
   4104                             )
   4105                      );
   4106 
   4107                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   4108                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   4109                put_emwarn( mkexpr(ew) );
   4110                /* Finally, if an emulation warning was reported,
   4111                   side-exit to the next insn, reporting the warning,
   4112                   so that Valgrind's dispatcher sees the warning. */
   4113                stmt(
   4114                   IRStmt_Exit(
   4115                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4116                      Ijk_EmWarn,
   4117                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4118                      OFFB_EIP
   4119                   )
   4120                );
   4121                break;
   4122             }
   4123 
   4124             case 6: { /* FNSTENV m28 */
   4125                /* Uses dirty helper:
   4126                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4127                IRDirty* d = unsafeIRDirty_0_N (
   4128                                0/*regparms*/,
   4129                                "x86g_dirtyhelper_FSTENV",
   4130                                &x86g_dirtyhelper_FSTENV,
   4131                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   4132                             );
   4133                /* declare we're writing memory */
   4134                d->mFx   = Ifx_Write;
   4135                d->mAddr = mkexpr(addr);
   4136                d->mSize = 28;
   4137 
   4138                /* declare we're reading guest state */
   4139                d->nFxState = 4;
   4140                vex_bzero(&d->fxState, sizeof(d->fxState));
   4141 
   4142                d->fxState[0].fx     = Ifx_Read;
   4143                d->fxState[0].offset = OFFB_FTOP;
   4144                d->fxState[0].size   = sizeof(UInt);
   4145 
   4146                d->fxState[1].fx     = Ifx_Read;
   4147                d->fxState[1].offset = OFFB_FPTAGS;
   4148                d->fxState[1].size   = 8 * sizeof(UChar);
   4149 
   4150                d->fxState[2].fx     = Ifx_Read;
   4151                d->fxState[2].offset = OFFB_FPROUND;
   4152                d->fxState[2].size   = sizeof(UInt);
   4153 
   4154                d->fxState[3].fx     = Ifx_Read;
   4155                d->fxState[3].offset = OFFB_FC3210;
   4156                d->fxState[3].size   = sizeof(UInt);
   4157 
   4158                stmt( IRStmt_Dirty(d) );
   4159 
   4160                DIP("fnstenv %s\n", dis_buf);
   4161                break;
   4162             }
   4163 
   4164             case 7: /* FNSTCW */
   4165               /* Fake up a native x87 FPU control word.  The only
   4166                  thing it depends on is FPROUND[1:0], so call a clean
   4167                  helper to cook it up. */
   4168                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4169                DIP("fnstcw %s\n", dis_buf);
   4170                storeLE(
   4171                   mkexpr(addr),
   4172                   unop( Iop_32to16,
   4173                         mkIRExprCCall(
   4174                            Ity_I32, 0/*regp*/,
   4175                            "x86g_create_fpucw", &x86g_create_fpucw,
   4176                            mkIRExprVec_1( get_fpround() )
   4177                         )
   4178                   )
   4179                );
   4180                break;
   4181 
   4182             default:
   4183                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   4184                vex_printf("first_opcode == 0xD9\n");
   4185                goto decode_fail;
   4186          }
   4187 
   4188       } else {
   4189          delta++;
   4190          switch (modrm) {
   4191 
   4192             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4193                r_src = (UInt)modrm - 0xC0;
   4194                DIP("fld %%st(%u)\n", r_src);
   4195                t1 = newTemp(Ity_F64);
   4196                assign(t1, get_ST(r_src));
   4197                fp_push();
   4198                put_ST(0, mkexpr(t1));
   4199                break;
   4200 
   4201             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4202                r_src = (UInt)modrm - 0xC8;
   4203                DIP("fxch %%st(%u)\n", r_src);
   4204                t1 = newTemp(Ity_F64);
   4205                t2 = newTemp(Ity_F64);
   4206                assign(t1, get_ST(0));
   4207                assign(t2, get_ST(r_src));
   4208                put_ST_UNCHECKED(0, mkexpr(t2));
   4209                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4210                break;
   4211 
   4212             case 0xE0: /* FCHS */
   4213                DIP("fchs\n");
   4214                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4215                break;
   4216 
   4217             case 0xE1: /* FABS */
   4218                DIP("fabs\n");
   4219                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4220                break;
   4221 
   4222             case 0xE4: /* FTST */
   4223                DIP("ftst\n");
   4224                /* This forces C1 to zero, which isn't right. */
   4225                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4226                   set to 0 if stack underflow occurred; otherwise, set
   4227                   to 0" which is pretty nonsensical.  I guess it's a
   4228                    typo. */
   4229                put_C3210(
   4230                    binop( Iop_And32,
   4231                           binop(Iop_Shl32,
   4232                                 binop(Iop_CmpF64,
   4233                                       get_ST(0),
   4234                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4235                                 mkU8(8)),
   4236                           mkU32(0x4500)
   4237                    ));
   4238                break;
   4239 
   4240             case 0xE5: { /* FXAM */
   4241                /* This is an interesting one.  It examines %st(0),
   4242                   regardless of whether the tag says it's empty or not.
   4243                   Here, just pass both the tag (in our format) and the
   4244                   value (as a double, actually a ULong) to a helper
   4245                   function. */
   4246                IRExpr** args
   4247                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4248                                    unop(Iop_ReinterpF64asI64,
   4249                                         get_ST_UNCHECKED(0)) );
   4250                put_C3210(mkIRExprCCall(
   4251                             Ity_I32,
   4252                             0/*regparm*/,
   4253                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4254                             args
   4255                         ));
   4256                DIP("fxam\n");
   4257                break;
   4258             }
   4259 
   4260             case 0xE8: /* FLD1 */
   4261                DIP("fld1\n");
   4262                fp_push();
   4263                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4264                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4265                break;
   4266 
   4267             case 0xE9: /* FLDL2T */
   4268                DIP("fldl2t\n");
   4269                fp_push();
   4270                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4271                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4272                break;
   4273 
   4274             case 0xEA: /* FLDL2E */
   4275                DIP("fldl2e\n");
   4276                fp_push();
   4277                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4278                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4279                break;
   4280 
   4281             case 0xEB: /* FLDPI */
   4282                DIP("fldpi\n");
   4283                fp_push();
   4284                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4285                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4286                break;
   4287 
   4288             case 0xEC: /* FLDLG2 */
   4289                DIP("fldlg2\n");
   4290                fp_push();
   4291                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4292                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4293                break;
   4294 
   4295             case 0xED: /* FLDLN2 */
   4296                DIP("fldln2\n");
   4297                fp_push();
   4298                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4299                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4300                break;
   4301 
   4302             case 0xEE: /* FLDZ */
   4303                DIP("fldz\n");
   4304                fp_push();
   4305                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4306                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4307                break;
   4308 
   4309             case 0xF0: /* F2XM1 */
   4310                DIP("f2xm1\n");
   4311                put_ST_UNCHECKED(0,
   4312                   binop(Iop_2xm1F64,
   4313                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4314                         get_ST(0)));
   4315                break;
   4316 
   4317             case 0xF1: /* FYL2X */
   4318                DIP("fyl2x\n");
   4319                put_ST_UNCHECKED(1,
   4320                   triop(Iop_Yl2xF64,
   4321                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4322                         get_ST(1),
   4323                         get_ST(0)));
   4324                fp_pop();
   4325                break;
   4326 
   4327             case 0xF2: { /* FPTAN */
   4328                DIP("fptan\n");
   4329                IRTemp argD = newTemp(Ity_F64);
   4330                assign(argD, get_ST(0));
   4331                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4332                IRTemp resD = newTemp(Ity_F64);
   4333                assign(resD,
   4334                   IRExpr_ITE(
   4335                      mkexpr(argOK),
   4336                      binop(Iop_TanF64,
   4337                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4338                            mkexpr(argD)),
   4339                      mkexpr(argD))
   4340                );
   4341                put_ST_UNCHECKED(0, mkexpr(resD));
   4342                /* Conditionally push 1.0 on the stack, if the arg is
   4343                   in range */
   4344                maybe_fp_push(argOK);
   4345                maybe_put_ST(argOK, 0,
   4346                             IRExpr_Const(IRConst_F64(1.0)));
   4347                set_C2( binop(Iop_Xor32,
   4348                              unop(Iop_1Uto32, mkexpr(argOK)),
   4349                              mkU32(1)) );
   4350                break;
   4351             }
   4352 
   4353             case 0xF3: /* FPATAN */
   4354                DIP("fpatan\n");
   4355                put_ST_UNCHECKED(1,
   4356                   triop(Iop_AtanF64,
   4357                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4358                         get_ST(1),
   4359                         get_ST(0)));
   4360                fp_pop();
   4361                break;
   4362 
   4363             case 0xF4: { /* FXTRACT */
   4364                IRTemp argF = newTemp(Ity_F64);
   4365                IRTemp sigF = newTemp(Ity_F64);
   4366                IRTemp expF = newTemp(Ity_F64);
   4367                IRTemp argI = newTemp(Ity_I64);
   4368                IRTemp sigI = newTemp(Ity_I64);
   4369                IRTemp expI = newTemp(Ity_I64);
   4370                DIP("fxtract\n");
   4371                assign( argF, get_ST(0) );
   4372                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4373                assign( sigI,
   4374                        mkIRExprCCall(
   4375                           Ity_I64, 0/*regparms*/,
   4376                           "x86amd64g_calculate_FXTRACT",
   4377                           &x86amd64g_calculate_FXTRACT,
   4378                           mkIRExprVec_2( mkexpr(argI),
   4379                                          mkIRExpr_HWord(0)/*sig*/ ))
   4380                );
   4381                assign( expI,
   4382                        mkIRExprCCall(
   4383                           Ity_I64, 0/*regparms*/,
   4384                           "x86amd64g_calculate_FXTRACT",
   4385                           &x86amd64g_calculate_FXTRACT,
   4386                           mkIRExprVec_2( mkexpr(argI),
   4387                                          mkIRExpr_HWord(1)/*exp*/ ))
   4388                );
   4389                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4390                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4391                /* exponent */
   4392                put_ST_UNCHECKED(0, mkexpr(expF) );
   4393                fp_push();
   4394                /* significand */
   4395                put_ST(0, mkexpr(sigF) );
   4396                break;
   4397             }
   4398 
   4399             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4400                IRTemp a1 = newTemp(Ity_F64);
   4401                IRTemp a2 = newTemp(Ity_F64);
   4402                DIP("fprem1\n");
   4403                /* Do FPREM1 twice, once to get the remainder, and once
   4404                   to get the C3210 flag values. */
   4405                assign( a1, get_ST(0) );
   4406                assign( a2, get_ST(1) );
   4407                put_ST_UNCHECKED(0,
   4408                   triop(Iop_PRem1F64,
   4409                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4410                         mkexpr(a1),
   4411                         mkexpr(a2)));
   4412                put_C3210(
   4413                   triop(Iop_PRem1C3210F64,
   4414                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4415                         mkexpr(a1),
   4416                         mkexpr(a2)) );
   4417                break;
   4418             }
   4419 
   4420             case 0xF7: /* FINCSTP */
   4421                DIP("fprem\n");
   4422                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4423                break;
   4424 
   4425             case 0xF8: { /* FPREM -- not IEEE compliant */
   4426                IRTemp a1 = newTemp(Ity_F64);
   4427                IRTemp a2 = newTemp(Ity_F64);
   4428                DIP("fprem\n");
   4429                /* Do FPREM twice, once to get the remainder, and once
   4430                   to get the C3210 flag values. */
   4431                assign( a1, get_ST(0) );
   4432                assign( a2, get_ST(1) );
   4433                put_ST_UNCHECKED(0,
   4434                   triop(Iop_PRemF64,
   4435                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4436                         mkexpr(a1),
   4437                         mkexpr(a2)));
   4438                put_C3210(
   4439                   triop(Iop_PRemC3210F64,
   4440                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4441                         mkexpr(a1),
   4442                         mkexpr(a2)) );
   4443                break;
   4444             }
   4445 
   4446             case 0xF9: /* FYL2XP1 */
   4447                DIP("fyl2xp1\n");
   4448                put_ST_UNCHECKED(1,
   4449                   triop(Iop_Yl2xp1F64,
   4450                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4451                         get_ST(1),
   4452                         get_ST(0)));
   4453                fp_pop();
   4454                break;
   4455 
   4456             case 0xFA: /* FSQRT */
   4457                DIP("fsqrt\n");
   4458                put_ST_UNCHECKED(0,
   4459                   binop(Iop_SqrtF64,
   4460                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4461                         get_ST(0)));
   4462                break;
   4463 
   4464             case 0xFB: { /* FSINCOS */
   4465                DIP("fsincos\n");
   4466                IRTemp argD = newTemp(Ity_F64);
   4467                assign(argD, get_ST(0));
   4468                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4469                IRTemp resD = newTemp(Ity_F64);
   4470                assign(resD,
   4471                   IRExpr_ITE(
   4472                      mkexpr(argOK),
   4473                      binop(Iop_SinF64,
   4474                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4475                            mkexpr(argD)),
   4476                      mkexpr(argD))
   4477                );
   4478                put_ST_UNCHECKED(0, mkexpr(resD));
   4479                /* Conditionally push the cos value on the stack, if
   4480                   the arg is in range */
   4481                maybe_fp_push(argOK);
   4482                maybe_put_ST(argOK, 0,
   4483                   binop(Iop_CosF64,
   4484                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4485                         mkexpr(argD)));
   4486                set_C2( binop(Iop_Xor32,
   4487                              unop(Iop_1Uto32, mkexpr(argOK)),
   4488                              mkU32(1)) );
   4489                break;
   4490             }
   4491 
   4492             case 0xFC: /* FRNDINT */
   4493                DIP("frndint\n");
   4494                put_ST_UNCHECKED(0,
   4495                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4496                break;
   4497 
   4498             case 0xFD: /* FSCALE */
   4499                DIP("fscale\n");
   4500                put_ST_UNCHECKED(0,
   4501                   triop(Iop_ScaleF64,
   4502                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4503                         get_ST(0),
   4504                         get_ST(1)));
   4505                break;
   4506 
   4507             case 0xFE:   /* FSIN */
   4508             case 0xFF: { /* FCOS */
   4509                Bool isSIN = modrm == 0xFE;
   4510                DIP("%s\n", isSIN ? "fsin" : "fcos");
   4511                IRTemp argD = newTemp(Ity_F64);
   4512                assign(argD, get_ST(0));
   4513                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4514                IRTemp resD = newTemp(Ity_F64);
   4515                assign(resD,
   4516                   IRExpr_ITE(
   4517                      mkexpr(argOK),
   4518                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   4519                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4520                            mkexpr(argD)),
   4521                      mkexpr(argD))
   4522                );
   4523                put_ST_UNCHECKED(0, mkexpr(resD));
   4524                set_C2( binop(Iop_Xor32,
   4525                              unop(Iop_1Uto32, mkexpr(argOK)),
   4526                              mkU32(1)) );
   4527                break;
   4528             }
   4529 
   4530             default:
   4531                goto decode_fail;
   4532          }
   4533       }
   4534    }
   4535 
   4536    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4537    else
   4538    if (first_opcode == 0xDA) {
   4539 
   4540       if (modrm < 0xC0) {
   4541 
   4542          /* bits 5,4,3 are an opcode extension, and the modRM also
   4543             specifies an address. */
   4544          IROp   fop;
   4545          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4546          delta += len;
   4547          switch (gregOfRM(modrm)) {
   4548 
   4549             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4550                DIP("fiaddl %s\n", dis_buf);
   4551                fop = Iop_AddF64;
   4552                goto do_fop_m32;
   4553 
   4554             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4555                DIP("fimull %s\n", dis_buf);
   4556                fop = Iop_MulF64;
   4557                goto do_fop_m32;
   4558 
   4559             case 2: /* FICOM m32int */
   4560                DIP("ficoml %s\n", dis_buf);
   4561                /* This forces C1 to zero, which isn't right. */
   4562                put_C3210(
   4563                    binop( Iop_And32,
   4564                           binop(Iop_Shl32,
   4565                                 binop(Iop_CmpF64,
   4566                                       get_ST(0),
   4567                                       unop(Iop_I32StoF64,
   4568                                            loadLE(Ity_I32,mkexpr(addr)))),
   4569                                 mkU8(8)),
   4570                           mkU32(0x4500)
   4571                    ));
   4572                break;
   4573 
   4574             case 3: /* FICOMP m32int */
   4575                DIP("ficompl %s\n", dis_buf);
   4576                /* This forces C1 to zero, which isn't right. */
   4577                put_C3210(
   4578                    binop( Iop_And32,
   4579                           binop(Iop_Shl32,
   4580                                 binop(Iop_CmpF64,
   4581                                       get_ST(0),
   4582                                       unop(Iop_I32StoF64,
   4583                                            loadLE(Ity_I32,mkexpr(addr)))),
   4584                                 mkU8(8)),
   4585                           mkU32(0x4500)
   4586                    ));
   4587                fp_pop();
   4588                break;
   4589 
   4590             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4591                DIP("fisubl %s\n", dis_buf);
   4592                fop = Iop_SubF64;
   4593                goto do_fop_m32;
   4594 
   4595             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4596                DIP("fisubrl %s\n", dis_buf);
   4597                fop = Iop_SubF64;
   4598                goto do_foprev_m32;
   4599 
   4600             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4601                DIP("fidivl %s\n", dis_buf);
   4602                fop = Iop_DivF64;
   4603                goto do_fop_m32;
   4604 
   4605             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4606                DIP("fidivrl %s\n", dis_buf);
   4607                fop = Iop_DivF64;
   4608                goto do_foprev_m32;
   4609 
   4610             do_fop_m32:
   4611                put_ST_UNCHECKED(0,
   4612                   triop(fop,
   4613                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4614                         get_ST(0),
   4615                         unop(Iop_I32StoF64,
   4616                              loadLE(Ity_I32, mkexpr(addr)))));
   4617                break;
   4618 
   4619             do_foprev_m32:
   4620                put_ST_UNCHECKED(0,
   4621                   triop(fop,
   4622                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4623                         unop(Iop_I32StoF64,
   4624                              loadLE(Ity_I32, mkexpr(addr))),
   4625                         get_ST(0)));
   4626                break;
   4627 
   4628             default:
   4629                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   4630                vex_printf("first_opcode == 0xDA\n");
   4631                goto decode_fail;
   4632          }
   4633 
   4634       } else {
   4635 
   4636          delta++;
   4637          switch (modrm) {
   4638 
   4639             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4640                r_src = (UInt)modrm - 0xC0;
   4641                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   4642                put_ST_UNCHECKED(0,
   4643                                 IRExpr_ITE(
   4644                                     mk_x86g_calculate_condition(X86CondB),
   4645                                     get_ST(r_src), get_ST(0)) );
   4646                break;
   4647 
   4648             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4649                r_src = (UInt)modrm - 0xC8;
   4650                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   4651                put_ST_UNCHECKED(0,
   4652                                 IRExpr_ITE(
   4653                                     mk_x86g_calculate_condition(X86CondZ),
   4654                                     get_ST(r_src), get_ST(0)) );
   4655                break;
   4656 
   4657             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4658                r_src = (UInt)modrm - 0xD0;
   4659                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   4660                put_ST_UNCHECKED(0,
   4661                                 IRExpr_ITE(
   4662                                     mk_x86g_calculate_condition(X86CondBE),
   4663                                     get_ST(r_src), get_ST(0)) );
   4664                break;
   4665 
   4666             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4667                r_src = (UInt)modrm - 0xD8;
   4668                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   4669                put_ST_UNCHECKED(0,
   4670                                 IRExpr_ITE(
   4671                                     mk_x86g_calculate_condition(X86CondP),
   4672                                     get_ST(r_src), get_ST(0)) );
   4673                break;
   4674 
   4675             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4676                DIP("fucompp %%st(0),%%st(1)\n");
   4677                /* This forces C1 to zero, which isn't right. */
   4678                put_C3210(
   4679                    binop( Iop_And32,
   4680                           binop(Iop_Shl32,
   4681                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4682                                 mkU8(8)),
   4683                           mkU32(0x4500)
   4684                    ));
   4685                fp_pop();
   4686                fp_pop();
   4687                break;
   4688 
   4689             default:
   4690                goto decode_fail;
   4691          }
   4692 
   4693       }
   4694    }
   4695 
   4696    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4697    else
   4698    if (first_opcode == 0xDB) {
   4699       if (modrm < 0xC0) {
   4700 
   4701          /* bits 5,4,3 are an opcode extension, and the modRM also
   4702             specifies an address. */
   4703          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4704          delta += len;
   4705 
   4706          switch (gregOfRM(modrm)) {
   4707 
   4708             case 0: /* FILD m32int */
   4709                DIP("fildl %s\n", dis_buf);
   4710                fp_push();
   4711                put_ST(0, unop(Iop_I32StoF64,
   4712                               loadLE(Ity_I32, mkexpr(addr))));
   4713                break;
   4714 
   4715             case 1: /* FISTTPL m32 (SSE3) */
   4716                DIP("fisttpl %s\n", dis_buf);
   4717                storeLE( mkexpr(addr),
   4718                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4719                fp_pop();
   4720                break;
   4721 
   4722             case 2: /* FIST m32 */
   4723                DIP("fistl %s\n", dis_buf);
   4724                storeLE( mkexpr(addr),
   4725                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4726                break;
   4727 
   4728             case 3: /* FISTP m32 */
   4729                DIP("fistpl %s\n", dis_buf);
   4730                storeLE( mkexpr(addr),
   4731                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4732                fp_pop();
   4733                break;
   4734 
   4735             case 5: { /* FLD extended-real */
   4736                /* Uses dirty helper:
   4737                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4738                   addr holds the address.  First, do a dirty call to
   4739                   get hold of the data. */
   4740                IRTemp   val  = newTemp(Ity_I64);
   4741                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4742 
   4743                IRDirty* d = unsafeIRDirty_1_N (
   4744                                val,
   4745                                0/*regparms*/,
   4746                                "x86g_dirtyhelper_loadF80le",
   4747                                &x86g_dirtyhelper_loadF80le,
   4748                                args
   4749                             );
   4750                /* declare that we're reading memory */
   4751                d->mFx   = Ifx_Read;
   4752                d->mAddr = mkexpr(addr);
   4753                d->mSize = 10;
   4754 
   4755                /* execute the dirty call, dumping the result in val. */
   4756                stmt( IRStmt_Dirty(d) );
   4757                fp_push();
   4758                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4759 
   4760                DIP("fldt %s\n", dis_buf);
   4761                break;
   4762             }
   4763 
   4764             case 7: { /* FSTP extended-real */
   4765                /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
   4766                IRExpr** args
   4767                   = mkIRExprVec_2( mkexpr(addr),
   4768                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4769 
   4770                IRDirty* d = unsafeIRDirty_0_N (
   4771                                0/*regparms*/,
   4772                                "x86g_dirtyhelper_storeF80le",
   4773                                &x86g_dirtyhelper_storeF80le,
   4774                                args
   4775                             );
   4776                /* declare we're writing memory */
   4777                d->mFx   = Ifx_Write;
   4778                d->mAddr = mkexpr(addr);
   4779                d->mSize = 10;
   4780 
   4781                /* execute the dirty call. */
   4782                stmt( IRStmt_Dirty(d) );
   4783                fp_pop();
   4784 
   4785                DIP("fstpt\n %s", dis_buf);
   4786                break;
   4787             }
   4788 
   4789             default:
   4790                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   4791                vex_printf("first_opcode == 0xDB\n");
   4792                goto decode_fail;
   4793          }
   4794 
   4795       } else {
   4796 
   4797          delta++;
   4798          switch (modrm) {
   4799 
   4800             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4801                r_src = (UInt)modrm - 0xC0;
   4802                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   4803                put_ST_UNCHECKED(0,
   4804                                 IRExpr_ITE(
   4805                                     mk_x86g_calculate_condition(X86CondNB),
   4806                                     get_ST(r_src), get_ST(0)) );
   4807                break;
   4808 
   4809             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4810                r_src = (UInt)modrm - 0xC8;
   4811                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   4812                put_ST_UNCHECKED(0,
   4813                                 IRExpr_ITE(
   4814                                     mk_x86g_calculate_condition(X86CondNZ),
   4815                                     get_ST(r_src), get_ST(0)) );
   4816                break;
   4817 
   4818             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4819                r_src = (UInt)modrm - 0xD0;
   4820                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   4821                put_ST_UNCHECKED(0,
   4822                                 IRExpr_ITE(
   4823                                     mk_x86g_calculate_condition(X86CondNBE),
   4824                                     get_ST(r_src), get_ST(0)) );
   4825                break;
   4826 
   4827             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4828                r_src = (UInt)modrm - 0xD8;
   4829                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   4830                put_ST_UNCHECKED(0,
   4831                                 IRExpr_ITE(
   4832                                     mk_x86g_calculate_condition(X86CondNP),
   4833                                     get_ST(r_src), get_ST(0)) );
   4834                break;
   4835 
   4836             case 0xE2:
   4837                DIP("fnclex\n");
   4838                break;
   4839 
   4840             case 0xE3: {
   4841                /* Uses dirty helper:
   4842                      void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
   4843                IRDirty* d  = unsafeIRDirty_0_N (
   4844                                 0/*regparms*/,
   4845                                 "x86g_dirtyhelper_FINIT",
   4846                                 &x86g_dirtyhelper_FINIT,
   4847                                 mkIRExprVec_1(IRExpr_GSPTR())
   4848                              );
   4849 
   4850                /* declare we're writing guest state */
   4851                d->nFxState = 5;
   4852                vex_bzero(&d->fxState, sizeof(d->fxState));
   4853 
   4854                d->fxState[0].fx     = Ifx_Write;
   4855                d->fxState[0].offset = OFFB_FTOP;
   4856                d->fxState[0].size   = sizeof(UInt);
   4857 
   4858                d->fxState[1].fx     = Ifx_Write;
   4859                d->fxState[1].offset = OFFB_FPREGS;
   4860                d->fxState[1].size   = 8 * sizeof(ULong);
   4861 
   4862                d->fxState[2].fx     = Ifx_Write;
   4863                d->fxState[2].offset = OFFB_FPTAGS;
   4864                d->fxState[2].size   = 8 * sizeof(UChar);
   4865 
   4866                d->fxState[3].fx     = Ifx_Write;
   4867                d->fxState[3].offset = OFFB_FPROUND;
   4868                d->fxState[3].size   = sizeof(UInt);
   4869 
   4870                d->fxState[4].fx     = Ifx_Write;
   4871                d->fxState[4].offset = OFFB_FC3210;
   4872                d->fxState[4].size   = sizeof(UInt);
   4873 
   4874                stmt( IRStmt_Dirty(d) );
   4875 
   4876                DIP("fninit\n");
   4877                break;
   4878             }
   4879 
   4880             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4881                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4882                break;
   4883 
   4884             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4885                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4886                break;
   4887 
   4888             default:
   4889                goto decode_fail;
   4890          }
   4891       }
   4892    }
   4893 
   4894    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4895    else
   4896    if (first_opcode == 0xDC) {
   4897       if (modrm < 0xC0) {
   4898 
   4899          /* bits 5,4,3 are an opcode extension, and the modRM also
   4900             specifies an address. */
   4901          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4902          delta += len;
   4903 
   4904          switch (gregOfRM(modrm)) {
   4905 
   4906             case 0: /* FADD double-real */
   4907                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4908                break;
   4909 
   4910             case 1: /* FMUL double-real */
   4911                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4912                break;
   4913 
   4914             case 2: /* FCOM double-real */
   4915                DIP("fcoml %s\n", dis_buf);
   4916                /* This forces C1 to zero, which isn't right. */
   4917                put_C3210(
   4918                    binop( Iop_And32,
   4919                           binop(Iop_Shl32,
   4920                                 binop(Iop_CmpF64,
   4921                                       get_ST(0),
   4922                                       loadLE(Ity_F64,mkexpr(addr))),
   4923                                 mkU8(8)),
   4924                           mkU32(0x4500)
   4925                    ));
   4926                break;
   4927 
   4928             case 3: /* FCOMP double-real */
   4929                DIP("fcompl %s\n", dis_buf);
   4930                /* This forces C1 to zero, which isn't right. */
   4931                put_C3210(
   4932                    binop( Iop_And32,
   4933                           binop(Iop_Shl32,
   4934                                 binop(Iop_CmpF64,
   4935                                       get_ST(0),
   4936                                       loadLE(Ity_F64,mkexpr(addr))),
   4937                                 mkU8(8)),
   4938                           mkU32(0x4500)
   4939                    ));
   4940                fp_pop();
   4941                break;
   4942 
   4943             case 4: /* FSUB double-real */
   4944                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4945                break;
   4946 
   4947             case 5: /* FSUBR double-real */
   4948                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4949                break;
   4950 
   4951             case 6: /* FDIV double-real */
   4952                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4953                break;
   4954 
   4955             case 7: /* FDIVR double-real */
   4956                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4957                break;
   4958 
   4959             default:
   4960                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   4961                vex_printf("first_opcode == 0xDC\n");
   4962                goto decode_fail;
   4963          }
   4964 
   4965       } else {
   4966 
   4967          delta++;
   4968          switch (modrm) {
   4969 
   4970             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4971                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4972                break;
   4973 
   4974             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4975                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4976                break;
   4977 
   4978             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4979                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4980                break;
   4981 
   4982             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4983                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4984                break;
   4985 
   4986             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4987                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4988                break;
   4989 
   4990             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4991                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4992                break;
   4993 
   4994             default:
   4995                goto decode_fail;
   4996          }
   4997 
   4998       }
   4999    }
   5000 
   5001    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   5002    else
   5003    if (first_opcode == 0xDD) {
   5004 
   5005       if (modrm < 0xC0) {
   5006 
   5007          /* bits 5,4,3 are an opcode extension, and the modRM also
   5008             specifies an address. */
   5009          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5010          delta += len;
   5011 
   5012          switch (gregOfRM(modrm)) {
   5013 
   5014             case 0: /* FLD double-real */
   5015                DIP("fldl %s\n", dis_buf);
   5016                fp_push();
   5017                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   5018                break;
   5019 
   5020             case 1: /* FISTTPQ m64 (SSE3) */
   5021                DIP("fistppll %s\n", dis_buf);
   5022                storeLE( mkexpr(addr),
   5023                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   5024                fp_pop();
   5025                break;
   5026 
   5027             case 2: /* FST double-real */
   5028                DIP("fstl %s\n", dis_buf);
   5029                storeLE(mkexpr(addr), get_ST(0));
   5030                break;
   5031 
   5032             case 3: /* FSTP double-real */
   5033                DIP("fstpl %s\n", dis_buf);
   5034                storeLE(mkexpr(addr), get_ST(0));
   5035                fp_pop();
   5036                break;
   5037 
   5038             case 4: { /* FRSTOR m108 */
   5039                /* Uses dirty helper:
   5040                      VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
   5041                IRTemp   ew = newTemp(Ity_I32);
   5042                IRDirty* d  = unsafeIRDirty_0_N (
   5043                                 0/*regparms*/,
   5044                                 "x86g_dirtyhelper_FRSTOR",
   5045                                 &x86g_dirtyhelper_FRSTOR,
   5046                                 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5047                              );
   5048                d->tmp   = ew;
   5049                /* declare we're reading memory */
   5050                d->mFx   = Ifx_Read;
   5051                d->mAddr = mkexpr(addr);
   5052                d->mSize = 108;
   5053 
   5054                /* declare we're writing guest state */
   5055                d->nFxState = 5;
   5056                vex_bzero(&d->fxState, sizeof(d->fxState));
   5057 
   5058                d->fxState[0].fx     = Ifx_Write;
   5059                d->fxState[0].offset = OFFB_FTOP;
   5060                d->fxState[0].size   = sizeof(UInt);
   5061 
   5062                d->fxState[1].fx     = Ifx_Write;
   5063                d->fxState[1].offset = OFFB_FPREGS;
   5064                d->fxState[1].size   = 8 * sizeof(ULong);
   5065 
   5066                d->fxState[2].fx     = Ifx_Write;
   5067                d->fxState[2].offset = OFFB_FPTAGS;
   5068                d->fxState[2].size   = 8 * sizeof(UChar);
   5069 
   5070                d->fxState[3].fx     = Ifx_Write;
   5071                d->fxState[3].offset = OFFB_FPROUND;
   5072                d->fxState[3].size   = sizeof(UInt);
   5073 
   5074                d->fxState[4].fx     = Ifx_Write;
   5075                d->fxState[4].offset = OFFB_FC3210;
   5076                d->fxState[4].size   = sizeof(UInt);
   5077 
   5078                stmt( IRStmt_Dirty(d) );
   5079 
   5080                /* ew contains any emulation warning we may need to
   5081                   issue.  If needed, side-exit to the next insn,
   5082                   reporting the warning, so that Valgrind's dispatcher
   5083                   sees the warning. */
   5084                put_emwarn( mkexpr(ew) );
   5085                stmt(
   5086                   IRStmt_Exit(
   5087                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5088                      Ijk_EmWarn,
   5089                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   5090                      OFFB_EIP
   5091                   )
   5092                );
   5093 
   5094                DIP("frstor %s\n", dis_buf);
   5095                break;
   5096             }
   5097 
   5098             case 6: { /* FNSAVE m108 */
   5099                /* Uses dirty helper:
   5100                      void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
   5101                IRDirty* d = unsafeIRDirty_0_N (
   5102                                0/*regparms*/,
   5103                                "x86g_dirtyhelper_FSAVE",
   5104                                &x86g_dirtyhelper_FSAVE,
   5105                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5106                             );
   5107                /* declare we're writing memory */
   5108                d->mFx   = Ifx_Write;
   5109                d->mAddr = mkexpr(addr);
   5110                d->mSize = 108;
   5111 
   5112                /* declare we're reading guest state */
   5113                d->nFxState = 5;
   5114                vex_bzero(&d->fxState, sizeof(d->fxState));
   5115 
   5116                d->fxState[0].fx     = Ifx_Read;
   5117                d->fxState[0].offset = OFFB_FTOP;
   5118                d->fxState[0].size   = sizeof(UInt);
   5119 
   5120                d->fxState[1].fx     = Ifx_Read;
   5121                d->fxState[1].offset = OFFB_FPREGS;
   5122                d->fxState[1].size   = 8 * sizeof(ULong);
   5123 
   5124                d->fxState[2].fx     = Ifx_Read;
   5125                d->fxState[2].offset = OFFB_FPTAGS;
   5126                d->fxState[2].size   = 8 * sizeof(UChar);
   5127 
   5128                d->fxState[3].fx     = Ifx_Read;
   5129                d->fxState[3].offset = OFFB_FPROUND;
   5130                d->fxState[3].size   = sizeof(UInt);
   5131 
   5132                d->fxState[4].fx     = Ifx_Read;
   5133                d->fxState[4].offset = OFFB_FC3210;
   5134                d->fxState[4].size   = sizeof(UInt);
   5135 
   5136                stmt( IRStmt_Dirty(d) );
   5137 
   5138                DIP("fnsave %s\n", dis_buf);
   5139                break;
   5140             }
   5141 
   5142             case 7: { /* FNSTSW m16 */
   5143                IRExpr* sw = get_FPU_sw();
   5144                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5145                storeLE( mkexpr(addr), sw );
   5146                DIP("fnstsw %s\n", dis_buf);
   5147                break;
   5148             }
   5149 
   5150             default:
   5151                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   5152                vex_printf("first_opcode == 0xDD\n");
   5153                goto decode_fail;
   5154          }
   5155       } else {
   5156          delta++;
   5157          switch (modrm) {
   5158 
   5159             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5160                r_dst = (UInt)modrm - 0xC0;
   5161                DIP("ffree %%st(%u)\n", r_dst);
   5162                put_ST_TAG ( r_dst, mkU8(0) );
   5163                break;
   5164 
   5165             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5166                r_dst = (UInt)modrm - 0xD0;
   5167                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   5168                /* P4 manual says: "If the destination operand is a
   5169                   non-empty register, the invalid-operation exception
   5170                   is not generated."  Hence put_ST_UNCHECKED. */
   5171                put_ST_UNCHECKED(r_dst, get_ST(0));
   5172                break;
   5173 
   5174             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5175                r_dst = (UInt)modrm - 0xD8;
   5176                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   5177                /* P4 manual says: "If the destination operand is a
   5178                   non-empty register, the invalid-operation exception
   5179                   is not generated."  Hence put_ST_UNCHECKED. */
   5180                put_ST_UNCHECKED(r_dst, get_ST(0));
   5181                fp_pop();
   5182                break;
   5183 
   5184             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5185                r_dst = (UInt)modrm - 0xE0;
   5186                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   5187                /* This forces C1 to zero, which isn't right. */
   5188                put_C3210(
   5189                    binop( Iop_And32,
   5190                           binop(Iop_Shl32,
   5191                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5192                                 mkU8(8)),
   5193                           mkU32(0x4500)
   5194                    ));
   5195                break;
   5196 
   5197             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5198                r_dst = (UInt)modrm - 0xE8;
   5199                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   5200                /* This forces C1 to zero, which isn't right. */
   5201                put_C3210(
   5202                    binop( Iop_And32,
   5203                           binop(Iop_Shl32,
   5204                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5205                                 mkU8(8)),
   5206                           mkU32(0x4500)
   5207                    ));
   5208                fp_pop();
   5209                break;
   5210 
   5211             default:
   5212                goto decode_fail;
   5213          }
   5214       }
   5215    }
   5216 
   5217    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5218    else
   5219    if (first_opcode == 0xDE) {
   5220 
   5221       if (modrm < 0xC0) {
   5222 
   5223          /* bits 5,4,3 are an opcode extension, and the modRM also
   5224             specifies an address. */
   5225          IROp   fop;
   5226          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5227          delta += len;
   5228 
   5229          switch (gregOfRM(modrm)) {
   5230 
   5231             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5232                DIP("fiaddw %s\n", dis_buf);
   5233                fop = Iop_AddF64;
   5234                goto do_fop_m16;
   5235 
   5236             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5237                DIP("fimulw %s\n", dis_buf);
   5238                fop = Iop_MulF64;
   5239                goto do_fop_m16;
   5240 
   5241             case 2: /* FICOM m16int */
   5242                DIP("ficomw %s\n", dis_buf);
   5243                /* This forces C1 to zero, which isn't right. */
   5244                put_C3210(
   5245                    binop( Iop_And32,
   5246                           binop(Iop_Shl32,
   5247                                 binop(Iop_CmpF64,
   5248                                       get_ST(0),
   5249                                       unop(Iop_I32StoF64,
   5250                                          unop(Iop_16Sto32,
   5251                                            loadLE(Ity_I16,mkexpr(addr))))),
   5252                                 mkU8(8)),
   5253                           mkU32(0x4500)
   5254                    ));
   5255                break;
   5256 
   5257             case 3: /* FICOMP m16int */
   5258                DIP("ficompw %s\n", dis_buf);
   5259                /* This forces C1 to zero, which isn't right. */
   5260                put_C3210(
   5261                    binop( Iop_And32,
   5262                           binop(Iop_Shl32,
   5263                                 binop(Iop_CmpF64,
   5264                                       get_ST(0),
   5265                                       unop(Iop_I32StoF64,
   5266                                          unop(Iop_16Sto32,
   5267                                               loadLE(Ity_I16,mkexpr(addr))))),
   5268                                 mkU8(8)),
   5269                           mkU32(0x4500)
   5270                    ));
   5271                fp_pop();
   5272                break;
   5273 
   5274             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5275                DIP("fisubw %s\n", dis_buf);
   5276                fop = Iop_SubF64;
   5277                goto do_fop_m16;
   5278 
   5279             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5280                DIP("fisubrw %s\n", dis_buf);
   5281                fop = Iop_SubF64;
   5282                goto do_foprev_m16;
   5283 
   5284             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5285                DIP("fisubw %s\n", dis_buf);
   5286                fop = Iop_DivF64;
   5287                goto do_fop_m16;
   5288 
   5289             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5290                DIP("fidivrw %s\n", dis_buf);
   5291                fop = Iop_DivF64;
   5292                goto do_foprev_m16;
   5293 
   5294             do_fop_m16:
   5295                put_ST_UNCHECKED(0,
   5296                   triop(fop,
   5297                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5298                         get_ST(0),
   5299                         unop(Iop_I32StoF64,
   5300                              unop(Iop_16Sto32,
   5301                                   loadLE(Ity_I16, mkexpr(addr))))));
   5302                break;
   5303 
   5304             do_foprev_m16:
   5305                put_ST_UNCHECKED(0,
   5306                   triop(fop,
   5307                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5308                         unop(Iop_I32StoF64,
   5309                              unop(Iop_16Sto32,
   5310                                   loadLE(Ity_I16, mkexpr(addr)))),
   5311                         get_ST(0)));
   5312                break;
   5313 
   5314             default:
   5315                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   5316                vex_printf("first_opcode == 0xDE\n");
   5317                goto decode_fail;
   5318          }
   5319 
   5320       } else {
   5321 
   5322          delta++;
   5323          switch (modrm) {
   5324 
   5325             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5326                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5327                break;
   5328 
   5329             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5330                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5331                break;
   5332 
   5333             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5334                DIP("fuompp %%st(0),%%st(1)\n");
   5335                /* This forces C1 to zero, which isn't right. */
   5336                put_C3210(
   5337                    binop( Iop_And32,
   5338                           binop(Iop_Shl32,
   5339                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5340                                 mkU8(8)),
   5341                           mkU32(0x4500)
   5342                    ));
   5343                fp_pop();
   5344                fp_pop();
   5345                break;
   5346 
   5347             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5348                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5349                break;
   5350 
   5351             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5352                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5353                break;
   5354 
   5355             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5356                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5357                break;
   5358 
   5359             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5360                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5361                break;
   5362 
   5363             default:
   5364                goto decode_fail;
   5365          }
   5366 
   5367       }
   5368    }
   5369 
   5370    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5371    else
   5372    if (first_opcode == 0xDF) {
   5373 
   5374       if (modrm < 0xC0) {
   5375 
   5376          /* bits 5,4,3 are an opcode extension, and the modRM also
   5377             specifies an address. */
   5378          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5379          delta += len;
   5380 
   5381          switch (gregOfRM(modrm)) {
   5382 
   5383             case 0: /* FILD m16int */
   5384                DIP("fildw %s\n", dis_buf);
   5385                fp_push();
   5386                put_ST(0, unop(Iop_I32StoF64,
   5387                               unop(Iop_16Sto32,
   5388                                    loadLE(Ity_I16, mkexpr(addr)))));
   5389                break;
   5390 
   5391             case 1: /* FISTTPS m16 (SSE3) */
   5392                DIP("fisttps %s\n", dis_buf);
   5393                storeLE( mkexpr(addr),
   5394                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5395                fp_pop();
   5396                break;
   5397 
   5398             case 2: /* FIST m16 */
   5399                DIP("fistp %s\n", dis_buf);
   5400                storeLE( mkexpr(addr),
   5401                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5402                break;
   5403 
   5404             case 3: /* FISTP m16 */
   5405                DIP("fistps %s\n", dis_buf);
   5406                storeLE( mkexpr(addr),
   5407                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5408                fp_pop();
   5409                break;
   5410 
   5411             case 5: /* FILD m64 */
   5412                DIP("fildll %s\n", dis_buf);
   5413                fp_push();
   5414                put_ST(0, binop(Iop_I64StoF64,
   5415                                get_roundingmode(),
   5416                                loadLE(Ity_I64, mkexpr(addr))));
   5417                break;
   5418 
   5419             case 7: /* FISTP m64 */
   5420                DIP("fistpll %s\n", dis_buf);
   5421                storeLE( mkexpr(addr),
   5422                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5423                fp_pop();
   5424                break;
   5425 
   5426             default:
   5427                vex_printf("unhandled opc_aux = 0x%2x\n", (UInt)gregOfRM(modrm));
   5428                vex_printf("first_opcode == 0xDF\n");
   5429                goto decode_fail;
   5430          }
   5431 
   5432       } else {
   5433 
   5434          delta++;
   5435          switch (modrm) {
   5436 
   5437             case 0xC0: /* FFREEP %st(0) */
   5438                DIP("ffreep %%st(%d)\n", 0);
   5439                put_ST_TAG ( 0, mkU8(0) );
   5440                fp_pop();
   5441                break;
   5442 
   5443             case 0xE0: /* FNSTSW %ax */
   5444                DIP("fnstsw %%ax\n");
   5445                /* Get the FPU status word value and dump it in %AX. */
   5446                if (0) {
   5447                   /* The obvious thing to do is simply dump the 16-bit
   5448                      status word value in %AX.  However, due to a
   5449                      limitation in Memcheck's origin tracking
   5450                      machinery, this causes Memcheck not to track the
   5451                      origin of any undefinedness into %AH (only into
   5452                      %AL/%AX/%EAX), which means origins are lost in
   5453                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5454                   putIReg(2, R_EAX, get_FPU_sw());
   5455                } else {
   5456                   /* So a somewhat lame kludge is to make it very
   5457                      clear to Memcheck that the value is written to
   5458                      both %AH and %AL.  This generates marginally
   5459                      worse code, but I don't think it matters much. */
   5460                   IRTemp t16 = newTemp(Ity_I16);
   5461                   assign(t16, get_FPU_sw());
   5462                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5463                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5464                }
   5465                break;
   5466 
   5467             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5468                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5469                break;
   5470 
   5471             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5472                /* not really right since COMIP != UCOMIP */
   5473                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5474                break;
   5475 
   5476             default:
   5477                goto decode_fail;
   5478          }
   5479       }
   5480 
   5481    }
   5482 
   5483    else
   5484    vpanic("dis_FPU(x86): invalid primary opcode");
   5485 
   5486    *decode_ok = True;
   5487    return delta;
   5488 
   5489   decode_fail:
   5490    *decode_ok = False;
   5491    return delta;
   5492 }
   5493 
   5494 
   5495 /*------------------------------------------------------------*/
   5496 /*---                                                      ---*/
   5497 /*--- MMX INSTRUCTIONS                                     ---*/
   5498 /*---                                                      ---*/
   5499 /*------------------------------------------------------------*/
   5500 
   5501 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5502    IA32 arch manual, volume 3):
   5503 
   5504    Read from, or write to MMX register (viz, any insn except EMMS):
   5505    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5506    * FP stack pointer set to zero
   5507 
   5508    EMMS:
   5509    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5510    * FP stack pointer set to zero
   5511 */
   5512 
   5513 static void do_MMX_preamble ( void )
   5514 {
   5515    Int         i;
   5516    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5517    IRExpr*     zero  = mkU32(0);
   5518    IRExpr*     tag1  = mkU8(1);
   5519    put_ftop(zero);
   5520    for (i = 0; i < 8; i++)
   5521       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   5522 }
   5523 
   5524 static void do_EMMS_preamble ( void )
   5525 {
   5526    Int         i;
   5527    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5528    IRExpr*     zero  = mkU32(0);
   5529    IRExpr*     tag0  = mkU8(0);
   5530    put_ftop(zero);
   5531    for (i = 0; i < 8; i++)
   5532       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   5533 }
   5534 
   5535 
   5536 static IRExpr* getMMXReg ( UInt archreg )
   5537 {
   5538    vassert(archreg < 8);
   5539    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5540 }
   5541 
   5542 
   5543 static void putMMXReg ( UInt archreg, IRExpr* e )
   5544 {
   5545    vassert(archreg < 8);
   5546    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5547    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5548 }
   5549 
   5550 
   5551 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5552    sense that it does not first call do_MMX_preamble() -- that is the
   5553    responsibility of its caller. */
   5554 
   5555 static
   5556 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5557                                Int    delta,
   5558                                UChar  opc,
   5559                                const HChar* name,
   5560                                Bool   show_granularity )
   5561 {
   5562    HChar   dis_buf[50];
   5563    UChar   modrm = getIByte(delta);
   5564    Bool    isReg = epartIsReg(modrm);
   5565    IRExpr* argL  = NULL;
   5566    IRExpr* argR  = NULL;
   5567    IRExpr* argG  = NULL;
   5568    IRExpr* argE  = NULL;
   5569    IRTemp  res   = newTemp(Ity_I64);
   5570 
   5571    Bool    invG  = False;
   5572    IROp    op    = Iop_INVALID;
   5573    void*   hAddr = NULL;
   5574    Bool    eLeft = False;
   5575    const HChar*  hName = NULL;
   5576 
   5577 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5578 
   5579    switch (opc) {
   5580       /* Original MMX ones */
   5581       case 0xFC: op = Iop_Add8x8; break;
   5582       case 0xFD: op = Iop_Add16x4; break;
   5583       case 0xFE: op = Iop_Add32x2; break;
   5584 
   5585       case 0xEC: op = Iop_QAdd8Sx8; break;
   5586       case 0xED: op = Iop_QAdd16Sx4; break;
   5587 
   5588       case 0xDC: op = Iop_QAdd8Ux8; break;
   5589       case 0xDD: op = Iop_QAdd16Ux4; break;
   5590 
   5591       case 0xF8: op = Iop_Sub8x8;  break;
   5592       case 0xF9: op = Iop_Sub16x4; break;
   5593       case 0xFA: op = Iop_Sub32x2; break;
   5594 
   5595       case 0xE8: op = Iop_QSub8Sx8; break;
   5596       case 0xE9: op = Iop_QSub16Sx4; break;
   5597 
   5598       case 0xD8: op = Iop_QSub8Ux8; break;
   5599       case 0xD9: op = Iop_QSub16Ux4; break;
   5600 
   5601       case 0xE5: op = Iop_MulHi16Sx4; break;
   5602       case 0xD5: op = Iop_Mul16x4; break;
   5603       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5604 
   5605       case 0x74: op = Iop_CmpEQ8x8; break;
   5606       case 0x75: op = Iop_CmpEQ16x4; break;
   5607       case 0x76: op = Iop_CmpEQ32x2; break;
   5608 
   5609       case 0x64: op = Iop_CmpGT8Sx8; break;
   5610       case 0x65: op = Iop_CmpGT16Sx4; break;
   5611       case 0x66: op = Iop_CmpGT32Sx2; break;
   5612 
   5613       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   5614       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   5615       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   5616 
   5617       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5618       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5619       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5620 
   5621       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5622       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5623       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5624 
   5625       case 0xDB: op = Iop_And64; break;
   5626       case 0xDF: op = Iop_And64; invG = True; break;
   5627       case 0xEB: op = Iop_Or64; break;
   5628       case 0xEF: /* Possibly do better here if argL and argR are the
   5629                     same reg */
   5630                  op = Iop_Xor64; break;
   5631 
   5632       /* Introduced in SSE1 */
   5633       case 0xE0: op = Iop_Avg8Ux8;    break;
   5634       case 0xE3: op = Iop_Avg16Ux4;   break;
   5635       case 0xEE: op = Iop_Max16Sx4;   break;
   5636       case 0xDE: op = Iop_Max8Ux8;    break;
   5637       case 0xEA: op = Iop_Min16Sx4;   break;
   5638       case 0xDA: op = Iop_Min8Ux8;    break;
   5639       case 0xE4: op = Iop_MulHi16Ux4; break;
   5640       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5641 
   5642       /* Introduced in SSE2 */
   5643       case 0xD4: op = Iop_Add64; break;
   5644       case 0xFB: op = Iop_Sub64; break;
   5645 
   5646       default:
   5647          vex_printf("\n0x%x\n", opc);
   5648          vpanic("dis_MMXop_regmem_to_reg");
   5649    }
   5650 
   5651 #  undef XXX
   5652 
   5653    argG = getMMXReg(gregOfRM(modrm));
   5654    if (invG)
   5655       argG = unop(Iop_Not64, argG);
   5656 
   5657    if (isReg) {
   5658       delta++;
   5659       argE = getMMXReg(eregOfRM(modrm));
   5660    } else {
   5661       Int    len;
   5662       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5663       delta += len;
   5664       argE = loadLE(Ity_I64, mkexpr(addr));
   5665    }
   5666 
   5667    if (eLeft) {
   5668       argL = argE;
   5669       argR = argG;
   5670    } else {
   5671       argL = argG;
   5672       argR = argE;
   5673    }
   5674 
   5675    if (op != Iop_INVALID) {
   5676       vassert(hName == NULL);
   5677       vassert(hAddr == NULL);
   5678       assign(res, binop(op, argL, argR));
   5679    } else {
   5680       vassert(hName != NULL);
   5681       vassert(hAddr != NULL);
   5682       assign( res,
   5683               mkIRExprCCall(
   5684                  Ity_I64,
   5685                  0/*regparms*/, hName, hAddr,
   5686                  mkIRExprVec_2( argL, argR )
   5687               )
   5688             );
   5689    }
   5690 
   5691    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5692 
   5693    DIP("%s%s %s, %s\n",
   5694        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5695        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5696        nameMMXReg(gregOfRM(modrm)) );
   5697 
   5698    return delta;
   5699 }
   5700 
   5701 
   5702 /* Vector by scalar shift of G by the amount specified at the bottom
   5703    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5704 
   5705 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5706                                  const HChar* opname, IROp op )
   5707 {
   5708    HChar   dis_buf[50];
   5709    Int     alen, size;
   5710    IRTemp  addr;
   5711    Bool    shl, shr, sar;
   5712    UChar   rm   = getIByte(delta);
   5713    IRTemp  g0   = newTemp(Ity_I64);
   5714    IRTemp  g1   = newTemp(Ity_I64);
   5715    IRTemp  amt  = newTemp(Ity_I32);
   5716    IRTemp  amt8 = newTemp(Ity_I8);
   5717 
   5718    if (epartIsReg(rm)) {
   5719       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5720       DIP("%s %s,%s\n", opname,
   5721                         nameMMXReg(eregOfRM(rm)),
   5722                         nameMMXReg(gregOfRM(rm)) );
   5723       delta++;
   5724    } else {
   5725       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5726       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5727       DIP("%s %s,%s\n", opname,
   5728                         dis_buf,
   5729                         nameMMXReg(gregOfRM(rm)) );
   5730       delta += alen;
   5731    }
   5732    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5733    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5734 
   5735    shl = shr = sar = False;
   5736    size = 0;
   5737    switch (op) {
   5738       case Iop_ShlN16x4: shl = True; size = 32; break;
   5739       case Iop_ShlN32x2: shl = True; size = 32; break;
   5740       case Iop_Shl64:    shl = True; size = 64; break;
   5741       case Iop_ShrN16x4: shr = True; size = 16; break;
   5742       case Iop_ShrN32x2: shr = True; size = 32; break;
   5743       case Iop_Shr64:    shr = True; size = 64; break;
   5744       case Iop_SarN16x4: sar = True; size = 16; break;
   5745       case Iop_SarN32x2: sar = True; size = 32; break;
   5746       default: vassert(0);
   5747    }
   5748 
   5749    if (shl || shr) {
   5750      assign(
   5751         g1,
   5752         IRExpr_ITE(
   5753            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5754            binop(op, mkexpr(g0), mkexpr(amt8)),
   5755            mkU64(0)
   5756         )
   5757      );
   5758    } else
   5759    if (sar) {
   5760      assign(
   5761         g1,
   5762         IRExpr_ITE(
   5763            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5764            binop(op, mkexpr(g0), mkexpr(amt8)),
   5765            binop(op, mkexpr(g0), mkU8(size-1))
   5766         )
   5767      );
   5768    } else {
   5769       /*NOTREACHED*/
   5770       vassert(0);
   5771    }
   5772 
   5773    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5774    return delta;
   5775 }
   5776 
   5777 
   5778 /* Vector by scalar shift of E by an immediate byte.  This is a
   5779    straight copy of dis_SSE_shiftE_imm. */
   5780 
   5781 static
   5782 UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   5783 {
   5784    Bool    shl, shr, sar;
   5785    UChar   rm   = getIByte(delta);
   5786    IRTemp  e0   = newTemp(Ity_I64);
   5787    IRTemp  e1   = newTemp(Ity_I64);
   5788    UChar   amt, size;
   5789    vassert(epartIsReg(rm));
   5790    vassert(gregOfRM(rm) == 2
   5791            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5792    amt = getIByte(delta+1);
   5793    delta += 2;
   5794    DIP("%s $%d,%s\n", opname,
   5795                       (Int)amt,
   5796                       nameMMXReg(eregOfRM(rm)) );
   5797 
   5798    assign( e0, getMMXReg(eregOfRM(rm)) );
   5799 
   5800    shl = shr = sar = False;
   5801    size = 0;
   5802    switch (op) {
   5803       case Iop_ShlN16x4: shl = True; size = 16; break;
   5804       case Iop_ShlN32x2: shl = True; size = 32; break;
   5805       case Iop_Shl64:    shl = True; size = 64; break;
   5806       case Iop_SarN16x4: sar = True; size = 16; break;
   5807       case Iop_SarN32x2: sar = True; size = 32; break;
   5808       case Iop_ShrN16x4: shr = True; size = 16; break;
   5809       case Iop_ShrN32x2: shr = True; size = 32; break;
   5810       case Iop_Shr64:    shr = True; size = 64; break;
   5811       default: vassert(0);
   5812    }
   5813 
   5814    if (shl || shr) {
   5815       assign( e1, amt >= size
   5816                      ? mkU64(0)
   5817                      : binop(op, mkexpr(e0), mkU8(amt))
   5818       );
   5819    } else
   5820    if (sar) {
   5821       assign( e1, amt >= size
   5822                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5823                      : binop(op, mkexpr(e0), mkU8(amt))
   5824       );
   5825    } else {
   5826       /*NOTREACHED*/
   5827       vassert(0);
   5828    }
   5829 
   5830    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5831    return delta;
   5832 }
   5833 
   5834 
   5835 /* Completely handle all MMX instructions except emms. */
   5836 
   5837 static
   5838 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5839 {
   5840    Int   len;
   5841    UChar modrm;
   5842    HChar dis_buf[50];
   5843    UChar opc = getIByte(delta);
   5844    delta++;
   5845 
   5846    /* dis_MMX handles all insns except emms. */
   5847    do_MMX_preamble();
   5848 
   5849    switch (opc) {
   5850 
   5851       case 0x6E:
   5852          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5853          if (sz != 4)
   5854             goto mmx_decode_failure;
   5855          modrm = getIByte(delta);
   5856          if (epartIsReg(modrm)) {
   5857             delta++;
   5858             putMMXReg(
   5859                gregOfRM(modrm),
   5860                binop( Iop_32HLto64,
   5861                       mkU32(0),
   5862                       getIReg(4, eregOfRM(modrm)) ) );
   5863             DIP("movd %s, %s\n",
   5864                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5865          } else {
   5866             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5867             delta += len;
   5868             putMMXReg(
   5869                gregOfRM(modrm),
   5870                binop( Iop_32HLto64,
   5871                       mkU32(0),
   5872                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5873             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5874          }
   5875          break;
   5876 
   5877       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5878          if (sz != 4)
   5879             goto mmx_decode_failure;
   5880          modrm = getIByte(delta);
   5881          if (epartIsReg(modrm)) {
   5882             delta++;
   5883             putIReg( 4, eregOfRM(modrm),
   5884                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5885             DIP("movd %s, %s\n",
   5886                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5887          } else {
   5888             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5889             delta += len;
   5890             storeLE( mkexpr(addr),
   5891                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5892             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5893          }
   5894          break;
   5895 
   5896       case 0x6F:
   5897          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5898          if (sz != 4)
   5899             goto mmx_decode_failure;
   5900          modrm = getIByte(delta);
   5901          if (epartIsReg(modrm)) {
   5902             delta++;
   5903             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5904             DIP("movq %s, %s\n",
   5905                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5906          } else {
   5907             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5908             delta += len;
   5909             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5910             DIP("movq %s, %s\n",
   5911                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5912          }
   5913          break;
   5914 
   5915       case 0x7F:
   5916          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5917          if (sz != 4)
   5918             goto mmx_decode_failure;
   5919          modrm = getIByte(delta);
   5920          if (epartIsReg(modrm)) {
   5921             delta++;
   5922             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5923             DIP("movq %s, %s\n",
   5924                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5925          } else {
   5926             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5927             delta += len;
   5928             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5929             DIP("mov(nt)q %s, %s\n",
   5930                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5931          }
   5932          break;
   5933 
   5934       case 0xFC:
   5935       case 0xFD:
   5936       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5937          if (sz != 4)
   5938             goto mmx_decode_failure;
   5939          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5940          break;
   5941 
   5942       case 0xEC:
   5943       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5944          if (sz != 4)
   5945             goto mmx_decode_failure;
   5946          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5947          break;
   5948 
   5949       case 0xDC:
   5950       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5951          if (sz != 4)
   5952             goto mmx_decode_failure;
   5953          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5954          break;
   5955 
   5956       case 0xF8:
   5957       case 0xF9:
   5958       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5959          if (sz != 4)
   5960             goto mmx_decode_failure;
   5961          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5962          break;
   5963 
   5964       case 0xE8:
   5965       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5966          if (sz != 4)
   5967             goto mmx_decode_failure;
   5968          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5969          break;
   5970 
   5971       case 0xD8:
   5972       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5973          if (sz != 4)
   5974             goto mmx_decode_failure;
   5975          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5976          break;
   5977 
   5978       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5979          if (sz != 4)
   5980             goto mmx_decode_failure;
   5981          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5982          break;
   5983 
   5984       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5985          if (sz != 4)
   5986             goto mmx_decode_failure;
   5987          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5988          break;
   5989 
   5990       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5991          vassert(sz == 4);
   5992          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5993          break;
   5994 
   5995       case 0x74:
   5996       case 0x75:
   5997       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5998          if (sz != 4)
   5999             goto mmx_decode_failure;
   6000          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   6001          break;
   6002 
   6003       case 0x64:
   6004       case 0x65:
   6005       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   6006          if (sz != 4)
   6007             goto mmx_decode_failure;
   6008          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   6009          break;
   6010 
   6011       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   6012          if (sz != 4)
   6013             goto mmx_decode_failure;
   6014          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   6015          break;
   6016 
   6017       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6018          if (sz != 4)
   6019             goto mmx_decode_failure;
   6020          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   6021          break;
   6022 
   6023       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6024          if (sz != 4)
   6025             goto mmx_decode_failure;
   6026          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   6027          break;
   6028 
   6029       case 0x68:
   6030       case 0x69:
   6031       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   6032          if (sz != 4)
   6033             goto mmx_decode_failure;
   6034          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   6035          break;
   6036 
   6037       case 0x60:
   6038       case 0x61:
   6039       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6040          if (sz != 4)
   6041             goto mmx_decode_failure;
   6042          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   6043          break;
   6044 
   6045       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   6046          if (sz != 4)
   6047             goto mmx_decode_failure;
   6048          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   6049          break;
   6050 
   6051       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   6052          if (sz != 4)
   6053             goto mmx_decode_failure;
   6054          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   6055          break;
   6056 
   6057       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   6058          if (sz != 4)
   6059             goto mmx_decode_failure;
   6060          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   6061          break;
   6062 
   6063       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   6064          if (sz != 4)
   6065             goto mmx_decode_failure;
   6066          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   6067          break;
   6068 
   6069 #     define SHIFT_BY_REG(_name,_op)                                 \
   6070                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   6071                 break;
   6072 
   6073       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6074       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   6075       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   6076       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   6077 
   6078       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6079       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   6080       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   6081       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   6082 
   6083       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   6084       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   6085       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   6086 
   6087 #     undef SHIFT_BY_REG
   6088 
   6089       case 0x71:
   6090       case 0x72:
   6091       case 0x73: {
   6092          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   6093          UChar byte2, subopc;
   6094          if (sz != 4)
   6095             goto mmx_decode_failure;
   6096          byte2  = getIByte(delta);           /* amode / sub-opcode */
   6097          subopc = toUChar( (byte2 >> 3) & 7 );
   6098 
   6099 #        define SHIFT_BY_IMM(_name,_op)                         \
   6100              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   6101              } while (0)
   6102 
   6103               if (subopc == 2 /*SRL*/ && opc == 0x71)
   6104                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   6105          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   6106                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   6107          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   6108                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   6109 
   6110          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   6111                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   6112          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   6113                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   6114 
   6115          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   6116                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   6117          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   6118                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   6119          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   6120                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   6121 
   6122          else goto mmx_decode_failure;
   6123 
   6124 #        undef SHIFT_BY_IMM
   6125          break;
   6126       }
   6127 
   6128       case 0xF7: {
   6129          IRTemp addr    = newTemp(Ity_I32);
   6130          IRTemp regD    = newTemp(Ity_I64);
   6131          IRTemp regM    = newTemp(Ity_I64);
   6132          IRTemp mask    = newTemp(Ity_I64);
   6133          IRTemp olddata = newTemp(Ity_I64);
   6134          IRTemp newdata = newTemp(Ity_I64);
   6135 
   6136          modrm = getIByte(delta);
   6137          if (sz != 4 || (!epartIsReg(modrm)))
   6138             goto mmx_decode_failure;
   6139          delta++;
   6140 
   6141          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6142          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6143          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6144          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6145          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6146          assign( newdata,
   6147                  binop(Iop_Or64,
   6148                        binop(Iop_And64,
   6149                              mkexpr(regD),
   6150                              mkexpr(mask) ),
   6151                        binop(Iop_And64,
   6152                              mkexpr(olddata),
   6153                              unop(Iop_Not64, mkexpr(mask)))) );
   6154          storeLE( mkexpr(addr), mkexpr(newdata) );
   6155          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6156                                  nameMMXReg( gregOfRM(modrm) ) );
   6157          break;
   6158       }
   6159 
   6160       /* --- MMX decode failure --- */
   6161       default:
   6162       mmx_decode_failure:
   6163          *decode_ok = False;
   6164          return delta; /* ignored */
   6165 
   6166    }
   6167 
   6168    *decode_ok = True;
   6169    return delta;
   6170 }
   6171 
   6172 
   6173 /*------------------------------------------------------------*/
   6174 /*--- More misc arithmetic and other obscure insns.        ---*/
   6175 /*------------------------------------------------------------*/
   6176 
   6177 /* Double length left and right shifts.  Apparently only required in
   6178    v-size (no b- variant). */
   6179 static
   6180 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6181                        Int delta, UChar modrm,
   6182                        Int sz,
   6183                        IRExpr* shift_amt,
   6184                        Bool amt_is_literal,
   6185                        const HChar* shift_amt_txt,
   6186                        Bool left_shift )
   6187 {
   6188    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6189       for printing it.   And eip on entry points at the modrm byte. */
   6190    Int len;
   6191    HChar dis_buf[50];
   6192 
   6193    IRType ty       = szToITy(sz);
   6194    IRTemp gsrc     = newTemp(ty);
   6195    IRTemp esrc     = newTemp(ty);
   6196    IRTemp addr     = IRTemp_INVALID;
   6197    IRTemp tmpSH    = newTemp(Ity_I8);
   6198    IRTemp tmpL     = IRTemp_INVALID;
   6199    IRTemp tmpRes   = IRTemp_INVALID;
   6200    IRTemp tmpSubSh = IRTemp_INVALID;
   6201    IROp   mkpair;
   6202    IROp   getres;
   6203    IROp   shift;
   6204    IRExpr* mask = NULL;
   6205 
   6206    vassert(sz == 2 || sz == 4);
   6207 
   6208    /* The E-part is the destination; this is shifted.  The G-part
   6209       supplies bits to be shifted into the E-part, but is not
   6210       changed.
   6211 
   6212       If shifting left, form a double-length word with E at the top
   6213       and G at the bottom, and shift this left.  The result is then in
   6214       the high part.
   6215 
   6216       If shifting right, form a double-length word with G at the top
   6217       and E at the bottom, and shift this right.  The result is then
   6218       at the bottom.  */
   6219 
   6220    /* Fetch the operands. */
   6221 
   6222    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6223 
   6224    if (epartIsReg(modrm)) {
   6225       delta++;
   6226       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6227       DIP("sh%cd%c %s, %s, %s\n",
   6228           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6229           shift_amt_txt,
   6230           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6231    } else {
   6232       addr = disAMode ( &len, sorb, delta, dis_buf );
   6233       delta += len;
   6234       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6235       DIP("sh%cd%c %s, %s, %s\n",
   6236           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6237           shift_amt_txt,
   6238           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6239    }
   6240 
   6241    /* Round up the relevant primops. */
   6242 
   6243    if (sz == 4) {
   6244       tmpL     = newTemp(Ity_I64);
   6245       tmpRes   = newTemp(Ity_I32);
   6246       tmpSubSh = newTemp(Ity_I32);
   6247       mkpair   = Iop_32HLto64;
   6248       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6249       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6250       mask     = mkU8(31);
   6251    } else {
   6252       /* sz == 2 */
   6253       tmpL     = newTemp(Ity_I32);
   6254       tmpRes   = newTemp(Ity_I16);
   6255       tmpSubSh = newTemp(Ity_I16);
   6256       mkpair   = Iop_16HLto32;
   6257       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6258       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6259       mask     = mkU8(15);
   6260    }
   6261 
   6262    /* Do the shift, calculate the subshift value, and set
   6263       the flag thunk. */
   6264 
   6265    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6266 
   6267    if (left_shift)
   6268       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6269    else
   6270       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6271 
   6272    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   6273    assign( tmpSubSh,
   6274            unop(getres,
   6275                 binop(shift,
   6276                       mkexpr(tmpL),
   6277                       binop(Iop_And8,
   6278                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6279                             mask))) );
   6280 
   6281    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6282                               tmpRes, tmpSubSh, ty, tmpSH );
   6283 
   6284    /* Put result back. */
   6285 
   6286    if (epartIsReg(modrm)) {
   6287       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6288    } else {
   6289       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6290    }
   6291 
   6292    if (amt_is_literal) delta++;
   6293    return delta;
   6294 }
   6295 
   6296 
   6297 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6298    required. */
   6299 
   /* Which member of the BT family is being handled.  The enumerator
      order is unchanged from the single-line form. */
   typedef
      enum {
         BtOpNone,    /* bt:  test only, operand not modified */
         BtOpSet,     /* bts: selected bit is set             */
         BtOpReset,   /* btr: selected bit is cleared         */
         BtOpComp     /* btc: selected bit is complemented    */
      }
      BtOp;
   6301 
   6302 static const HChar* nameBtOp ( BtOp op )
   6303 {
   6304    switch (op) {
   6305       case BtOpNone:  return "";
   6306       case BtOpSet:   return "s";
   6307       case BtOpReset: return "r";
   6308       case BtOpComp:  return "c";
   6309       default: vpanic("nameBtOp(x86)");
   6310    }
   6311 }
   6312 
   6313 
   6314 static
   6315 UInt dis_bt_G_E ( const VexAbiInfo* vbi,
   6316                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6317 {
   6318    HChar  dis_buf[50];
   6319    UChar  modrm;
   6320    Int    len;
   6321    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6322           t_addr1, t_esp, t_mask, t_new;
   6323 
   6324    vassert(sz == 2 || sz == 4);
   6325 
   6326    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6327              = t_addr0 = t_addr1 = t_esp
   6328              = t_mask = t_new = IRTemp_INVALID;
   6329 
   6330    t_fetched = newTemp(Ity_I8);
   6331    t_new     = newTemp(Ity_I8);
   6332    t_bitno0  = newTemp(Ity_I32);
   6333    t_bitno1  = newTemp(Ity_I32);
   6334    t_bitno2  = newTemp(Ity_I8);
   6335    t_addr1   = newTemp(Ity_I32);
   6336    modrm     = getIByte(delta);
   6337 
   6338    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6339 
   6340    if (epartIsReg(modrm)) {
   6341       delta++;
   6342       /* Get it onto the client's stack. */
   6343       t_esp = newTemp(Ity_I32);
   6344       t_addr0 = newTemp(Ity_I32);
   6345 
   6346       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6347          guest_amd64_toIR.c.  We point out here only that 128 is
   6348          fast-cased in Memcheck and is > 0, so seems like a good
   6349          choice. */
   6350       vassert(vbi->guest_stack_redzone_size == 0);
   6351       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6352       putIReg(4, R_ESP, mkexpr(t_esp));
   6353 
   6354       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6355 
   6356       /* Make t_addr0 point at it. */
   6357       assign( t_addr0, mkexpr(t_esp) );
   6358 
   6359       /* Mask out upper bits of the shift amount, since we're doing a
   6360          reg. */
   6361       assign( t_bitno1, binop(Iop_And32,
   6362                               mkexpr(t_bitno0),
   6363                               mkU32(sz == 4 ? 31 : 15)) );
   6364 
   6365    } else {
   6366       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6367       delta += len;
   6368       assign( t_bitno1, mkexpr(t_bitno0) );
   6369    }
   6370 
   6371    /* At this point: t_addr0 is the address being operated on.  If it
   6372       was a reg, we will have pushed it onto the client's stack.
   6373       t_bitno1 is the bit number, suitably masked in the case of a
   6374       reg.  */
   6375 
   6376    /* Now the main sequence. */
   6377    assign( t_addr1,
   6378            binop(Iop_Add32,
   6379                  mkexpr(t_addr0),
   6380                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6381 
   6382    /* t_addr1 now holds effective address */
   6383 
   6384    assign( t_bitno2,
   6385            unop(Iop_32to8,
   6386                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6387 
   6388    /* t_bitno2 contains offset of bit within byte */
   6389 
   6390    if (op != BtOpNone) {
   6391       t_mask = newTemp(Ity_I8);
   6392       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6393    }
   6394 
   6395    /* t_mask is now a suitable byte mask */
   6396 
   6397    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6398 
   6399    if (op != BtOpNone) {
   6400       switch (op) {
   6401          case BtOpSet:
   6402             assign( t_new,
   6403                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6404             break;
   6405          case BtOpComp:
   6406             assign( t_new,
   6407                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6408             break;
   6409          case BtOpReset:
   6410             assign( t_new,
   6411                     binop(Iop_And8, mkexpr(t_fetched),
   6412                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6413             break;
   6414          default:
   6415             vpanic("dis_bt_G_E(x86)");
   6416       }
   6417       if (locked && !epartIsReg(modrm)) {
   6418          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6419                                  mkexpr(t_new)/*new*/,
   6420                                  guest_EIP_curr_instr );
   6421       } else {
   6422          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6423       }
   6424    }
   6425 
   6426    /* Side effect done; now get selected bit into Carry flag */
   6427    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6428    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6429    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6430    stmt( IRStmt_Put(
   6431             OFFB_CC_DEP1,
   6432             binop(Iop_And32,
   6433                   binop(Iop_Shr32,
   6434                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6435                         mkexpr(t_bitno2)),
   6436                   mkU32(1)))
   6437        );
   6438    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6439       elimination of previous stores to this field work better. */
   6440    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6441 
   6442    /* Move reg operand from stack back to reg */
   6443    if (epartIsReg(modrm)) {
   6444       /* t_esp still points at it. */
   6445       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6446       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6447    }
   6448 
   6449    DIP("bt%s%c %s, %s\n",
   6450        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6451        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6452 
   6453    return delta;
   6454 }
   6455 
   6456 
   6457 
   6458 /* Handle BSF/BSR.  Only v-size seems necessary. */
   6459 static
   6460 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6461 {
   6462    Bool   isReg;
   6463    UChar  modrm;
   6464    HChar  dis_buf[50];
   6465 
   6466    IRType ty  = szToITy(sz);
   6467    IRTemp src = newTemp(ty);
   6468    IRTemp dst = newTemp(ty);
   6469 
   6470    IRTemp src32 = newTemp(Ity_I32);
   6471    IRTemp dst32 = newTemp(Ity_I32);
   6472    IRTemp srcB  = newTemp(Ity_I1);
   6473 
   6474    vassert(sz == 4 || sz == 2);
   6475 
   6476    modrm = getIByte(delta);
   6477 
   6478    isReg = epartIsReg(modrm);
   6479    if (isReg) {
   6480       delta++;
   6481       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6482    } else {
   6483       Int    len;
   6484       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6485       delta += len;
   6486       assign( src, loadLE(ty, mkexpr(addr)) );
   6487    }
   6488 
   6489    DIP("bs%c%c %s, %s\n",
   6490        fwds ? 'f' : 'r', nameISize(sz),
   6491        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6492        nameIReg(sz, gregOfRM(modrm)));
   6493 
   6494    /* Generate a bool expression which is zero iff the original is
   6495       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
   6496       instrumented by Memcheck, is instrumented expensively, since
   6497       this may be used on the output of a preceding movmskb insn,
   6498       which has been known to be partially defined, and in need of
   6499       careful handling. */
   6500    assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
   6501                        mkexpr(src), mkU(ty,0)) );
   6502 
   6503    /* Flags: Z is 1 iff source value is zero.  All others
   6504       are undefined -- we force them to zero. */
   6505    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6506    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6507    stmt( IRStmt_Put(
   6508             OFFB_CC_DEP1,
   6509             IRExpr_ITE( mkexpr(srcB),
   6510                         /* src!=0 */
   6511                         mkU32(0),
   6512                         /* src==0 */
   6513                         mkU32(X86G_CC_MASK_Z)
   6514                         )
   6515        ));
   6516    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6517       elimination of previous stores to this field work better. */
   6518    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6519 
   6520    /* Result: iff source value is zero, we can't use
   6521       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6522       But anyway, Intel x86 semantics say the result is undefined in
   6523       such situations.  Hence handle the zero case specially. */
   6524 
   6525    /* Bleh.  What we compute:
   6526 
   6527           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6528           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6529 
   6530           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6531           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6532 
   6533       First, widen src to 32 bits if it is not already.
   6534 
   6535       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6536       dst register unchanged when src == 0.  Hence change accordingly.
   6537    */
   6538    if (sz == 2)
   6539       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6540    else
   6541       assign( src32, mkexpr(src) );
   6542 
   6543    /* The main computation, guarding against zero. */
   6544    assign( dst32,
   6545            IRExpr_ITE(
   6546               mkexpr(srcB),
   6547               /* src != 0 */
   6548               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6549                    : binop(Iop_Sub32,
   6550                            mkU32(31),
   6551                            unop(Iop_Clz32, mkexpr(src32))),
   6552               /* src == 0 -- leave dst unchanged */
   6553               widenUto32( getIReg( sz, gregOfRM(modrm) ) )
   6554            )
   6555          );
   6556 
   6557    if (sz == 2)
   6558       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6559    else
   6560       assign( dst, mkexpr(dst32) );
   6561 
   6562    /* dump result back */
   6563    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6564 
   6565    return delta;
   6566 }
   6567 
   6568 
   6569 static
   6570 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6571 {
   6572    IRType ty = szToITy(sz);
   6573    IRTemp t1 = newTemp(ty);
   6574    IRTemp t2 = newTemp(ty);
   6575    vassert(sz == 2 || sz == 4);
   6576    assign( t1, getIReg(sz, R_EAX) );
   6577    assign( t2, getIReg(sz, reg) );
   6578    putIReg( sz, R_EAX, mkexpr(t2) );
   6579    putIReg( sz, reg, mkexpr(t1) );
   6580    DIP("xchg%c %s, %s\n",
   6581        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6582 }
   6583 
   6584 
   6585 static
   6586 void codegen_SAHF ( void )
   6587 {
   6588    /* Set the flags to:
   6589       (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
   6590       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6591                 |X86G_CC_MASK_P|X86G_CC_MASK_C)
   6592    */
   6593    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6594                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6595    IRTemp oldflags   = newTemp(Ity_I32);
   6596    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6597    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6598    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6599    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6600    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6601          binop(Iop_Or32,
   6602                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6603                binop(Iop_And32,
   6604                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6605                      mkU32(mask_SZACP))
   6606               )
   6607    ));
   6608    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6609       elimination of previous stores to this field work better. */
   6610    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6611 }
   6612 
   6613 
   6614 static
   6615 void codegen_LAHF ( void  )
   6616 {
   6617    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   6618    IRExpr* eax_with_hole;
   6619    IRExpr* new_byte;
   6620    IRExpr* new_eax;
   6621    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6622                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6623 
   6624    IRTemp  flags = newTemp(Ity_I32);
   6625    assign( flags, mk_x86g_calculate_eflags_all() );
   6626 
   6627    eax_with_hole
   6628       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6629    new_byte
   6630       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6631                         mkU32(1<<1));
   6632    new_eax
   6633       = binop(Iop_Or32, eax_with_hole,
   6634                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6635    putIReg(4, R_EAX, new_eax);
   6636 }
   6637 
   6638 
   6639 static
   6640 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6641                        Bool        locked,
   6642                        Int         size,
   6643                        Int         delta0 )
   6644 {
   6645    HChar dis_buf[50];
   6646    Int   len;
   6647 
   6648    IRType ty    = szToITy(size);
   6649    IRTemp acc   = newTemp(ty);
   6650    IRTemp src   = newTemp(ty);
   6651    IRTemp dest  = newTemp(ty);
   6652    IRTemp dest2 = newTemp(ty);
   6653    IRTemp acc2  = newTemp(ty);
   6654    IRTemp cond  = newTemp(Ity_I1);
   6655    IRTemp addr  = IRTemp_INVALID;
   6656    UChar  rm    = getUChar(delta0);
   6657 
   6658    /* There are 3 cases to consider:
   6659 
   6660       reg-reg: ignore any lock prefix, generate sequence based
   6661                on ITE
   6662 
   6663       reg-mem, not locked: ignore any lock prefix, generate sequence
   6664                            based on ITE
   6665 
   6666       reg-mem, locked: use IRCAS
   6667    */
   6668    if (epartIsReg(rm)) {
   6669       /* case 1 */
   6670       assign( dest, getIReg(size, eregOfRM(rm)) );
   6671       delta0++;
   6672       assign( src, getIReg(size, gregOfRM(rm)) );
   6673       assign( acc, getIReg(size, R_EAX) );
   6674       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6675       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6676       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6677       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6678       putIReg(size, R_EAX, mkexpr(acc2));
   6679       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6680       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6681                                nameIReg(size,gregOfRM(rm)),
   6682                                nameIReg(size,eregOfRM(rm)) );
   6683    }
   6684    else if (!epartIsReg(rm) && !locked) {
   6685       /* case 2 */
   6686       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6687       assign( dest, loadLE(ty, mkexpr(addr)) );
   6688       delta0 += len;
   6689       assign( src, getIReg(size, gregOfRM(rm)) );
   6690       assign( acc, getIReg(size, R_EAX) );
   6691       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6692       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6693       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6694       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6695       putIReg(size, R_EAX, mkexpr(acc2));
   6696       storeLE( mkexpr(addr), mkexpr(dest2) );
   6697       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6698                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6699    }
   6700    else if (!epartIsReg(rm) && locked) {
   6701       /* case 3 */
   6702       /* src is new value.  acc is expected value.  dest is old value.
   6703          Compute success from the output of the IRCAS, and steer the
   6704          new value for EAX accordingly: in case of success, EAX is
   6705          unchanged. */
   6706       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6707       delta0 += len;
   6708       assign( src, getIReg(size, gregOfRM(rm)) );
   6709       assign( acc, getIReg(size, R_EAX) );
   6710       stmt( IRStmt_CAS(
   6711          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6712                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6713       ));
   6714       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6715       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6716       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6717       putIReg(size, R_EAX, mkexpr(acc2));
   6718       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6719                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6720    }
   6721    else vassert(0);
   6722 
   6723    return delta0;
   6724 }
   6725 
   6726 
   6727 /* Handle conditional move instructions of the form
   6728       cmovcc E(reg-or-mem), G(reg)
   6729 
   6730    E(src) is reg-or-mem
   6731    G(dst) is reg.
   6732 
   6733    If E is reg, -->    GET %E, tmps
   6734                        GET %G, tmpd
   6735                        CMOVcc tmps, tmpd
   6736                        PUT tmpd, %G
   6737 
   6738    If E is mem  -->    (getAddr E) -> tmpa
   6739                        LD (tmpa), tmps
   6740                        GET %G, tmpd
   6741                        CMOVcc tmps, tmpd
   6742                        PUT tmpd, %G
   6743 */
   6744 static
   6745 UInt dis_cmov_E_G ( UChar       sorb,
   6746                     Int         sz,
   6747                     X86Condcode cond,
   6748                     Int         delta0 )
   6749 {
   6750    UChar rm  = getIByte(delta0);
   6751    HChar dis_buf[50];
   6752    Int   len;
   6753 
   6754    IRType ty   = szToITy(sz);
   6755    IRTemp tmps = newTemp(ty);
   6756    IRTemp tmpd = newTemp(ty);
   6757 
   6758    if (epartIsReg(rm)) {
   6759       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6760       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6761 
   6762       putIReg(sz, gregOfRM(rm),
   6763                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6764                               mkexpr(tmps),
   6765                               mkexpr(tmpd) )
   6766              );
   6767       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6768                               name_X86Condcode(cond),
   6769                               nameIReg(sz,eregOfRM(rm)),
   6770                               nameIReg(sz,gregOfRM(rm)));
   6771       return 1+delta0;
   6772    }
   6773 
   6774    /* E refers to memory */
   6775    {
   6776       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6777       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6778       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6779 
   6780       putIReg(sz, gregOfRM(rm),
   6781                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6782                               mkexpr(tmps),
   6783                               mkexpr(tmpd) )
   6784              );
   6785 
   6786       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6787                               name_X86Condcode(cond),
   6788                               dis_buf,
   6789                               nameIReg(sz,gregOfRM(rm)));
   6790       return len+delta0;
   6791    }
   6792 }
   6793 
   6794 
   6795 static
   6796 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6797                     Bool* decodeOK )
   6798 {
   6799    Int   len;
   6800    UChar rm = getIByte(delta0);
   6801    HChar dis_buf[50];
   6802 
   6803    IRType ty    = szToITy(sz);
   6804    IRTemp tmpd  = newTemp(ty);
   6805    IRTemp tmpt0 = newTemp(ty);
   6806    IRTemp tmpt1 = newTemp(ty);
   6807 
   6808    /* There are 3 cases to consider:
   6809 
   6810       reg-reg: ignore any lock prefix,
   6811                generate 'naive' (non-atomic) sequence
   6812 
   6813       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6814                            (non-atomic) sequence
   6815 
   6816       reg-mem, locked: use IRCAS
   6817    */
   6818 
   6819    if (epartIsReg(rm)) {
   6820       /* case 1 */
   6821       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6822       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6823       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6824                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6825       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6826       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6827       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6828       DIP("xadd%c %s, %s\n",
   6829           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6830           				 nameIReg(sz,eregOfRM(rm)));
   6831       *decodeOK = True;
   6832       return 1+delta0;
   6833    }
   6834    else if (!epartIsReg(rm) && !locked) {
   6835       /* case 2 */
   6836       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6837       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6838       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6839       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6840                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6841       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6842       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6843       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6844       DIP("xadd%c %s, %s\n",
   6845           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6846       *decodeOK = True;
   6847       return len+delta0;
   6848    }
   6849    else if (!epartIsReg(rm) && locked) {
   6850       /* case 3 */
   6851       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6852       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6853       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6854       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6855                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6856       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6857                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6858       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6859       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6860       DIP("xadd%c %s, %s\n",
   6861           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6862       *decodeOK = True;
   6863       return len+delta0;
   6864    }
   6865    /*UNREACHED*/
   6866    vassert(0);
   6867 }
   6868 
   6869 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   6870 
   6871 static
   6872 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6873 {
   6874    Int    len;
   6875    IRTemp addr;
   6876    UChar  rm  = getIByte(delta0);
   6877    HChar  dis_buf[50];
   6878 
   6879    if (epartIsReg(rm)) {
   6880       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6881       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6882       return 1+delta0;
   6883    } else {
   6884       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6885       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6886       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6887       return len+delta0;
   6888    }
   6889 }
   6890 
   6891 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6892    dst is ireg and sz==4, zero out top half of it.  */
   6893 
   6894 static
   6895 UInt dis_mov_Sw_Ew ( UChar sorb,
   6896                      Int   sz,
   6897                      Int   delta0 )
   6898 {
   6899    Int    len;
   6900    IRTemp addr;
   6901    UChar  rm  = getIByte(delta0);
   6902    HChar  dis_buf[50];
   6903 
   6904    vassert(sz == 2 || sz == 4);
   6905 
   6906    if (epartIsReg(rm)) {
   6907       if (sz == 4)
   6908          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6909       else
   6910          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6911 
   6912       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6913       return 1+delta0;
   6914    } else {
   6915       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6916       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6917       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6918       return len+delta0;
   6919    }
   6920 }
   6921 
   6922 
   6923 static
   6924 void dis_push_segreg ( UInt sreg, Int sz )
   6925 {
   6926     IRTemp t1 = newTemp(Ity_I16);
   6927     IRTemp ta = newTemp(Ity_I32);
   6928     vassert(sz == 2 || sz == 4);
   6929 
   6930     assign( t1, getSReg(sreg) );
   6931     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6932     putIReg(4, R_ESP, mkexpr(ta));
   6933     storeLE( mkexpr(ta), mkexpr(t1) );
   6934 
   6935     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6936 }
   6937 
   6938 static
   6939 void dis_pop_segreg ( UInt sreg, Int sz )
   6940 {
   6941     IRTemp t1 = newTemp(Ity_I16);
   6942     IRTemp ta = newTemp(Ity_I32);
   6943     vassert(sz == 2 || sz == 4);
   6944 
   6945     assign( ta, getIReg(4, R_ESP) );
   6946     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6947 
   6948     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6949     putSReg( sreg, mkexpr(t1) );
   6950     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6951 }
   6952 
   6953 static
   6954 void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
   6955 {
   6956    IRTemp t1 = newTemp(Ity_I32);
   6957    IRTemp t2 = newTemp(Ity_I32);
   6958    assign(t1, getIReg(4,R_ESP));
   6959    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6960    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6961    jmp_treg(dres, Ijk_Ret, t2);
   6962    vassert(dres->whatNext == Dis_StopHere);
   6963 }
   6964 
   6965 /*------------------------------------------------------------*/
   6966 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6967 /*------------------------------------------------------------*/
   6968 
   6969 /* Indicates whether the op requires a rounding-mode argument.  Note
   6970    that this covers only vector floating point arithmetic ops, and
   6971    omits the scalar ones that need rounding modes.  Note also that
   6972    inconsistencies here will get picked up later by the IR sanity
   6973    checker, so this isn't correctness-critical. */
   6974 static Bool requiresRMode ( IROp op )
   6975 {
   6976    switch (op) {
   6977       /* 128 bit ops */
   6978       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   6979       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   6980       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   6981       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   6982          return True;
   6983       default:
   6984          break;
   6985    }
   6986    return False;
   6987 }
   6988 
   6989 
   6990 /* Worker function; do not call directly.
   6991    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6992 */
   6993 
   6994 static UInt dis_SSE_E_to_G_all_wrk (
   6995                UChar sorb, Int delta,
   6996                const HChar* opname, IROp op,
   6997                Bool   invertG
   6998             )
   6999 {
   7000    HChar   dis_buf[50];
   7001    Int     alen;
   7002    IRTemp  addr;
   7003    UChar   rm = getIByte(delta);
   7004    IRExpr* gpart
   7005       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   7006                 : getXMMReg(gregOfRM(rm));
   7007    if (epartIsReg(rm)) {
   7008       putXMMReg(
   7009          gregOfRM(rm),
   7010          requiresRMode(op)
   7011             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7012                         gpart,
   7013                         getXMMReg(eregOfRM(rm)))
   7014             : binop(op, gpart,
   7015                         getXMMReg(eregOfRM(rm)))
   7016       );
   7017       DIP("%s %s,%s\n", opname,
   7018                         nameXMMReg(eregOfRM(rm)),
   7019                         nameXMMReg(gregOfRM(rm)) );
   7020       return delta+1;
   7021    } else {
   7022       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7023       putXMMReg(
   7024          gregOfRM(rm),
   7025          requiresRMode(op)
   7026             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7027                         gpart,
   7028                         loadLE(Ity_V128, mkexpr(addr)))
   7029             : binop(op, gpart,
   7030                         loadLE(Ity_V128, mkexpr(addr)))
   7031       );
   7032       DIP("%s %s,%s\n", opname,
   7033                         dis_buf,
   7034                         nameXMMReg(gregOfRM(rm)) );
   7035       return delta+alen;
   7036    }
   7037 }
   7038 
   7039 
   7040 /* All lanes SSE binary operation, G = G `op` E. */
   7041 
   7042 static
   7043 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
   7044 {
   7045    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   7046 }
   7047 
   7048 /* All lanes SSE binary operation, G = (not G) `op` E. */
   7049 
   7050 static
   7051 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   7052                                const HChar* opname, IROp op )
   7053 {
   7054    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   7055 }
   7056 
   7057 
   7058 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   7059 
   7060 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   7061                                   const HChar* opname, IROp op )
   7062 {
   7063    HChar   dis_buf[50];
   7064    Int     alen;
   7065    IRTemp  addr;
   7066    UChar   rm = getIByte(delta);
   7067    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7068    if (epartIsReg(rm)) {
   7069       putXMMReg( gregOfRM(rm),
   7070                  binop(op, gpart,
   7071                            getXMMReg(eregOfRM(rm))) );
   7072       DIP("%s %s,%s\n", opname,
   7073                         nameXMMReg(eregOfRM(rm)),
   7074                         nameXMMReg(gregOfRM(rm)) );
   7075       return delta+1;
   7076    } else {
   7077       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   7078          E operand needs to be made simply of zeroes. */
   7079       IRTemp epart = newTemp(Ity_V128);
   7080       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7081       assign( epart, unop( Iop_32UtoV128,
   7082                            loadLE(Ity_I32, mkexpr(addr))) );
   7083       putXMMReg( gregOfRM(rm),
   7084                  binop(op, gpart, mkexpr(epart)) );
   7085       DIP("%s %s,%s\n", opname,
   7086                         dis_buf,
   7087                         nameXMMReg(gregOfRM(rm)) );
   7088       return delta+alen;
   7089    }
   7090 }
   7091 
   7092 
   7093 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   7094 
   7095 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   7096                                   const HChar* opname, IROp op )
   7097 {
   7098    HChar   dis_buf[50];
   7099    Int     alen;
   7100    IRTemp  addr;
   7101    UChar   rm = getIByte(delta);
   7102    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7103    if (epartIsReg(rm)) {
   7104       putXMMReg( gregOfRM(rm),
   7105                  binop(op, gpart,
   7106                            getXMMReg(eregOfRM(rm))) );
   7107       DIP("%s %s,%s\n", opname,
   7108                         nameXMMReg(eregOfRM(rm)),
   7109                         nameXMMReg(gregOfRM(rm)) );
   7110       return delta+1;
   7111    } else {
   7112       /* We can only do a 64-bit memory read, so the upper half of the
   7113          E operand needs to be made simply of zeroes. */
   7114       IRTemp epart = newTemp(Ity_V128);
   7115       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7116       assign( epart, unop( Iop_64UtoV128,
   7117                            loadLE(Ity_I64, mkexpr(addr))) );
   7118       putXMMReg( gregOfRM(rm),
   7119                  binop(op, gpart, mkexpr(epart)) );
   7120       DIP("%s %s,%s\n", opname,
   7121                         dis_buf,
   7122                         nameXMMReg(gregOfRM(rm)) );
   7123       return delta+alen;
   7124    }
   7125 }
   7126 
   7127 
   7128 /* All lanes unary SSE operation, G = op(E). */
   7129 
   7130 static UInt dis_SSE_E_to_G_unary_all (
   7131                UChar sorb, Int delta,
   7132                const HChar* opname, IROp op
   7133             )
   7134 {
   7135    HChar   dis_buf[50];
   7136    Int     alen;
   7137    IRTemp  addr;
   7138    UChar   rm = getIByte(delta);
   7139    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   7140    // up in the usual way.
   7141    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   7142    if (epartIsReg(rm)) {
   7143       IRExpr* src = getXMMReg(eregOfRM(rm));
   7144       /* XXXROUNDINGFIXME */
   7145       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   7146                               : unop(op, src);
   7147       putXMMReg( gregOfRM(rm), res );
   7148       DIP("%s %s,%s\n", opname,
   7149                         nameXMMReg(eregOfRM(rm)),
   7150                         nameXMMReg(gregOfRM(rm)) );
   7151       return delta+1;
   7152    } else {
   7153       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7154       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
   7155       /* XXXROUNDINGFIXME */
   7156       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   7157                               : unop(op, src);
   7158       putXMMReg( gregOfRM(rm), res );
   7159       DIP("%s %s,%s\n", opname,
   7160                         dis_buf,
   7161                         nameXMMReg(gregOfRM(rm)) );
   7162       return delta+alen;
   7163    }
   7164 }
   7165 
   7166 
   7167 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   7168 
   7169 static UInt dis_SSE_E_to_G_unary_lo32 (
   7170                UChar sorb, Int delta,
   7171                const HChar* opname, IROp op
   7172             )
   7173 {
   7174    /* First we need to get the old G value and patch the low 32 bits
   7175       of the E operand into it.  Then apply op and write back to G. */
   7176    HChar   dis_buf[50];
   7177    Int     alen;
   7178    IRTemp  addr;
   7179    UChar   rm = getIByte(delta);
   7180    IRTemp  oldG0 = newTemp(Ity_V128);
   7181    IRTemp  oldG1 = newTemp(Ity_V128);
   7182 
   7183    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7184 
   7185    if (epartIsReg(rm)) {
   7186       assign( oldG1,
   7187               binop( Iop_SetV128lo32,
   7188                      mkexpr(oldG0),
   7189                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7190       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7191       DIP("%s %s,%s\n", opname,
   7192                         nameXMMReg(eregOfRM(rm)),
   7193                         nameXMMReg(gregOfRM(rm)) );
   7194       return delta+1;
   7195    } else {
   7196       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7197       assign( oldG1,
   7198               binop( Iop_SetV128lo32,
   7199                      mkexpr(oldG0),
   7200                      loadLE(Ity_I32, mkexpr(addr)) ));
   7201       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7202       DIP("%s %s,%s\n", opname,
   7203                         dis_buf,
   7204                         nameXMMReg(gregOfRM(rm)) );
   7205       return delta+alen;
   7206    }
   7207 }
   7208 
   7209 
   7210 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7211 
   7212 static UInt dis_SSE_E_to_G_unary_lo64 (
   7213                UChar sorb, Int delta,
   7214                const HChar* opname, IROp op
   7215             )
   7216 {
   7217    /* First we need to get the old G value and patch the low 64 bits
   7218       of the E operand into it.  Then apply op and write back to G. */
   7219    HChar   dis_buf[50];
   7220    Int     alen;
   7221    IRTemp  addr;
   7222    UChar   rm = getIByte(delta);
   7223    IRTemp  oldG0 = newTemp(Ity_V128);
   7224    IRTemp  oldG1 = newTemp(Ity_V128);
   7225 
   7226    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7227 
   7228    if (epartIsReg(rm)) {
   7229       assign( oldG1,
   7230               binop( Iop_SetV128lo64,
   7231                      mkexpr(oldG0),
   7232                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7233       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7234       DIP("%s %s,%s\n", opname,
   7235                         nameXMMReg(eregOfRM(rm)),
   7236                         nameXMMReg(gregOfRM(rm)) );
   7237       return delta+1;
   7238    } else {
   7239       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7240       assign( oldG1,
   7241               binop( Iop_SetV128lo64,
   7242                      mkexpr(oldG0),
   7243                      loadLE(Ity_I64, mkexpr(addr)) ));
   7244       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7245       DIP("%s %s,%s\n", opname,
   7246                         dis_buf,
   7247                         nameXMMReg(gregOfRM(rm)) );
   7248       return delta+alen;
   7249    }
   7250 }
   7251 
   7252 
   7253 /* SSE integer binary operation:
   7254       G = G `op` E   (eLeft == False)
   7255       G = E `op` G   (eLeft == True)
   7256 */
   7257 static UInt dis_SSEint_E_to_G(
   7258                UChar sorb, Int delta,
   7259                const HChar* opname, IROp op,
   7260                Bool   eLeft
   7261             )
   7262 {
   7263    HChar   dis_buf[50];
   7264    Int     alen;
   7265    IRTemp  addr;
   7266    UChar   rm = getIByte(delta);
   7267    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7268    IRExpr* epart = NULL;
   7269    if (epartIsReg(rm)) {
   7270       epart = getXMMReg(eregOfRM(rm));
   7271       DIP("%s %s,%s\n", opname,
   7272                         nameXMMReg(eregOfRM(rm)),
   7273                         nameXMMReg(gregOfRM(rm)) );
   7274       delta += 1;
   7275    } else {
   7276       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7277       epart = loadLE(Ity_V128, mkexpr(addr));
   7278       DIP("%s %s,%s\n", opname,
   7279                         dis_buf,
   7280                         nameXMMReg(gregOfRM(rm)) );
   7281       delta += alen;
   7282    }
   7283    putXMMReg( gregOfRM(rm),
   7284               eLeft ? binop(op, epart, gpart)
   7285 	            : binop(op, gpart, epart) );
   7286    return delta;
   7287 }
   7288 
   7289 
   7290 /* Helper for doing SSE FP comparisons. */
   7291 
   7292 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7293                            Int imm8, Bool all_lanes, Int sz )
   7294 {
   7295    imm8 &= 7;
   7296    *needNot = False;
   7297    *op      = Iop_INVALID;
   7298    if (imm8 >= 4) {
   7299       *needNot = True;
   7300       imm8 -= 4;
   7301    }
   7302 
   7303    if (sz == 4 && all_lanes) {
   7304       switch (imm8) {
   7305          case 0: *op = Iop_CmpEQ32Fx4; return;
   7306          case 1: *op = Iop_CmpLT32Fx4; return;
   7307          case 2: *op = Iop_CmpLE32Fx4; return;
   7308          case 3: *op = Iop_CmpUN32Fx4; return;
   7309          default: break;
   7310       }
   7311    }
   7312    if (sz == 4 && !all_lanes) {
   7313       switch (imm8) {
   7314          case 0: *op = Iop_CmpEQ32F0x4; return;
   7315          case 1: *op = Iop_CmpLT32F0x4; return;
   7316          case 2: *op = Iop_CmpLE32F0x4; return;
   7317          case 3: *op = Iop_CmpUN32F0x4; return;
   7318          default: break;
   7319       }
   7320    }
   7321    if (sz == 8 && all_lanes) {
   7322       switch (imm8) {
   7323          case 0: *op = Iop_CmpEQ64Fx2; return;
   7324          case 1: *op = Iop_CmpLT64Fx2; return;
   7325          case 2: *op = Iop_CmpLE64Fx2; return;
   7326          case 3: *op = Iop_CmpUN64Fx2; return;
   7327          default: break;
   7328       }
   7329    }
   7330    if (sz == 8 && !all_lanes) {
   7331       switch (imm8) {
   7332          case 0: *op = Iop_CmpEQ64F0x2; return;
   7333          case 1: *op = Iop_CmpLT64F0x2; return;
   7334          case 2: *op = Iop_CmpLE64F0x2; return;
   7335          case 3: *op = Iop_CmpUN64F0x2; return;
   7336          default: break;
   7337       }
   7338    }
   7339    vpanic("findSSECmpOp(x86,guest)");
   7340 }
   7341 
   7342 /* Handles SSE 32F/64F comparisons. */
   7343 
   7344 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7345 				const HChar* opname, Bool all_lanes, Int sz )
   7346 {
   7347    HChar   dis_buf[50];
   7348    Int     alen, imm8;
   7349    IRTemp  addr;
   7350    Bool    needNot = False;
   7351    IROp    op      = Iop_INVALID;
   7352    IRTemp  plain   = newTemp(Ity_V128);
   7353    UChar   rm      = getIByte(delta);
   7354    UShort  mask    = 0;
   7355    vassert(sz == 4 || sz == 8);
   7356    if (epartIsReg(rm)) {
   7357       imm8 = getIByte(delta+1);
   7358       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7359       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7360                                getXMMReg(eregOfRM(rm))) );
   7361       delta += 2;
   7362       DIP("%s $%d,%s,%s\n", opname,
   7363                             imm8,
   7364                             nameXMMReg(eregOfRM(rm)),
   7365                             nameXMMReg(gregOfRM(rm)) );
   7366    } else {
   7367       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7368       imm8 = getIByte(delta+alen);
   7369       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7370       assign( plain,
   7371               binop(
   7372                  op,
   7373                  getXMMReg(gregOfRM(rm)),
   7374                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7375                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7376                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7377              )
   7378       );
   7379       delta += alen+1;
   7380       DIP("%s $%d,%s,%s\n", opname,
   7381                             imm8,
   7382                             dis_buf,
   7383                             nameXMMReg(gregOfRM(rm)) );
   7384    }
   7385 
   7386    if (needNot && all_lanes) {
   7387       putXMMReg( gregOfRM(rm),
   7388                  unop(Iop_NotV128, mkexpr(plain)) );
   7389    }
   7390    else
   7391    if (needNot && !all_lanes) {
   7392       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7393       putXMMReg( gregOfRM(rm),
   7394                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7395    }
   7396    else {
   7397       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7398    }
   7399 
   7400    return delta;
   7401 }
   7402 
   7403 
   7404 /* Vector by scalar shift of G by the amount specified at the bottom
   7405    of E. */
   7406 
   7407 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7408                                  const HChar* opname, IROp op )
   7409 {
   7410    HChar   dis_buf[50];
   7411    Int     alen, size;
   7412    IRTemp  addr;
   7413    Bool    shl, shr, sar;
   7414    UChar   rm   = getIByte(delta);
   7415    IRTemp  g0   = newTemp(Ity_V128);
   7416    IRTemp  g1   = newTemp(Ity_V128);
   7417    IRTemp  amt  = newTemp(Ity_I32);
   7418    IRTemp  amt8 = newTemp(Ity_I8);
   7419    if (epartIsReg(rm)) {
   7420       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7421       DIP("%s %s,%s\n", opname,
   7422                         nameXMMReg(eregOfRM(rm)),
   7423                         nameXMMReg(gregOfRM(rm)) );
   7424       delta++;
   7425    } else {
   7426       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7427       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7428       DIP("%s %s,%s\n", opname,
   7429                         dis_buf,
   7430                         nameXMMReg(gregOfRM(rm)) );
   7431       delta += alen;
   7432    }
   7433    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7434    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7435 
   7436    shl = shr = sar = False;
   7437    size = 0;
   7438    switch (op) {
   7439       case Iop_ShlN16x8: shl = True; size = 32; break;
   7440       case Iop_ShlN32x4: shl = True; size = 32; break;
   7441       case Iop_ShlN64x2: shl = True; size = 64; break;
   7442       case Iop_SarN16x8: sar = True; size = 16; break;
   7443       case Iop_SarN32x4: sar = True; size = 32; break;
   7444       case Iop_ShrN16x8: shr = True; size = 16; break;
   7445       case Iop_ShrN32x4: shr = True; size = 32; break;
   7446       case Iop_ShrN64x2: shr = True; size = 64; break;
   7447       default: vassert(0);
   7448    }
   7449 
   7450    if (shl || shr) {
   7451      assign(
   7452         g1,
   7453         IRExpr_ITE(
   7454            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7455            binop(op, mkexpr(g0), mkexpr(amt8)),
   7456            mkV128(0x0000)
   7457         )
   7458      );
   7459    } else
   7460    if (sar) {
   7461      assign(
   7462         g1,
   7463         IRExpr_ITE(
   7464            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7465            binop(op, mkexpr(g0), mkexpr(amt8)),
   7466            binop(op, mkexpr(g0), mkU8(size-1))
   7467         )
   7468      );
   7469    } else {
   7470       /*NOTREACHED*/
   7471       vassert(0);
   7472    }
   7473 
   7474    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7475    return delta;
   7476 }
   7477 
   7478 
   7479 /* Vector by scalar shift of E by an immediate byte. */
   7480 
   7481 static
   7482 UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   7483 {
   7484    Bool    shl, shr, sar;
   7485    UChar   rm   = getIByte(delta);
   7486    IRTemp  e0   = newTemp(Ity_V128);
   7487    IRTemp  e1   = newTemp(Ity_V128);
   7488    UChar   amt, size;
   7489    vassert(epartIsReg(rm));
   7490    vassert(gregOfRM(rm) == 2
   7491            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7492    amt = getIByte(delta+1);
   7493    delta += 2;
   7494    DIP("%s $%d,%s\n", opname,
   7495                       (Int)amt,
   7496                       nameXMMReg(eregOfRM(rm)) );
   7497    assign( e0, getXMMReg(eregOfRM(rm)) );
   7498 
   7499    shl = shr = sar = False;
   7500    size = 0;
   7501    switch (op) {
   7502       case Iop_ShlN16x8: shl = True; size = 16; break;
   7503       case Iop_ShlN32x4: shl = True; size = 32; break;
   7504       case Iop_ShlN64x2: shl = True; size = 64; break;
   7505       case Iop_SarN16x8: sar = True; size = 16; break;
   7506       case Iop_SarN32x4: sar = True; size = 32; break;
   7507       case Iop_ShrN16x8: shr = True; size = 16; break;
   7508       case Iop_ShrN32x4: shr = True; size = 32; break;
   7509       case Iop_ShrN64x2: shr = True; size = 64; break;
   7510       default: vassert(0);
   7511    }
   7512 
   7513    if (shl || shr) {
   7514       assign( e1, amt >= size
   7515                      ? mkV128(0x0000)
   7516                      : binop(op, mkexpr(e0), mkU8(amt))
   7517       );
   7518    } else
   7519    if (sar) {
   7520       assign( e1, amt >= size
   7521                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7522                      : binop(op, mkexpr(e0), mkU8(amt))
   7523       );
   7524    } else {
   7525       /*NOTREACHED*/
   7526       vassert(0);
   7527    }
   7528 
   7529    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7530    return delta;
   7531 }
   7532 
   7533 
   7534 /* Get the current SSE rounding mode. */
   7535 
   7536 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7537 {
   7538    return binop( Iop_And32,
   7539                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7540                  mkU32(3) );
   7541 }
   7542 
   7543 static void put_sse_roundingmode ( IRExpr* sseround )
   7544 {
   7545    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7546    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7547 }
   7548 
   7549 /* Break a 128-bit value up into four 32-bit ints. */
   7550 
   7551 static void breakup128to32s ( IRTemp t128,
   7552 			      /*OUTs*/
   7553                               IRTemp* t3, IRTemp* t2,
   7554                               IRTemp* t1, IRTemp* t0 )
   7555 {
   7556    IRTemp hi64 = newTemp(Ity_I64);
   7557    IRTemp lo64 = newTemp(Ity_I64);
   7558    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7559    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7560 
   7561    vassert(t0 && *t0 == IRTemp_INVALID);
   7562    vassert(t1 && *t1 == IRTemp_INVALID);
   7563    vassert(t2 && *t2 == IRTemp_INVALID);
   7564    vassert(t3 && *t3 == IRTemp_INVALID);
   7565 
   7566    *t0 = newTemp(Ity_I32);
   7567    *t1 = newTemp(Ity_I32);
   7568    *t2 = newTemp(Ity_I32);
   7569    *t3 = newTemp(Ity_I32);
   7570    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7571    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7572    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7573    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7574 }
   7575 
   7576 /* Construct a 128-bit value from four 32-bit ints. */
   7577 
   7578 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7579                               IRTemp t1, IRTemp t0 )
   7580 {
   7581    return
   7582       binop( Iop_64HLtoV128,
   7583              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7584              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7585    );
   7586 }
   7587 
   7588 /* Break a 64-bit value up into four 16-bit ints. */
   7589 
   7590 static void breakup64to16s ( IRTemp t64,
   7591                              /*OUTs*/
   7592                              IRTemp* t3, IRTemp* t2,
   7593                              IRTemp* t1, IRTemp* t0 )
   7594 {
   7595    IRTemp hi32 = newTemp(Ity_I32);
   7596    IRTemp lo32 = newTemp(Ity_I32);
   7597    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7598    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7599 
   7600    vassert(t0 && *t0 == IRTemp_INVALID);
   7601    vassert(t1 && *t1 == IRTemp_INVALID);
   7602    vassert(t2 && *t2 == IRTemp_INVALID);
   7603    vassert(t3 && *t3 == IRTemp_INVALID);
   7604 
   7605    *t0 = newTemp(Ity_I16);
   7606    *t1 = newTemp(Ity_I16);
   7607    *t2 = newTemp(Ity_I16);
   7608    *t3 = newTemp(Ity_I16);
   7609    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7610    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7611    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7612    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7613 }
   7614 
   7615 /* Construct a 64-bit value from four 16-bit ints. */
   7616 
   7617 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7618                              IRTemp t1, IRTemp t0 )
   7619 {
   7620    return
   7621       binop( Iop_32HLto64,
   7622              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7623              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7624    );
   7625 }
   7626 
   7627 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7628    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7629    C P D ID AC.
   7630 
   7631    In all cases, code to set AC is generated.  However, VEX actually
   7632    ignores the AC value and so can optionally emit an emulation
   7633    warning when it is enabled.  In this routine, an emulation warning
   7634    is only emitted if emit_AC_emwarn is True, in which case
   7635    next_insn_EIP must be correct (this allows for correct code
   7636    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7637    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7638    generation for IRET.) */
   7639 
   7640 static
   7641 void set_EFLAGS_from_value ( IRTemp t1,
   7642                              Bool   emit_AC_emwarn,
   7643                              Addr32 next_insn_EIP )
   7644 {
   7645    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7646 
   7647    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7648       the flags thunk to X86G_CC_OP_COPY. */
   7649    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7650    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7651    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7652                      binop(Iop_And32,
   7653                            mkexpr(t1),
   7654                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7655                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7656                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7657                           )
   7658                     )
   7659        );
   7660    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7661       elimination of previous stores to this field work better. */
   7662    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7663 
   7664    /* Also need to set the D flag, which is held in bit 10 of t1.
   7665       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7666    stmt( IRStmt_Put(
   7667             OFFB_DFLAG,
   7668             IRExpr_ITE(
   7669                unop(Iop_32to1,
   7670                     binop(Iop_And32,
   7671                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7672                           mkU32(1))),
   7673                mkU32(0xFFFFFFFF),
   7674                mkU32(1)))
   7675        );
   7676 
   7677    /* Set the ID flag */
   7678    stmt( IRStmt_Put(
   7679             OFFB_IDFLAG,
   7680             IRExpr_ITE(
   7681                unop(Iop_32to1,
   7682                     binop(Iop_And32,
   7683                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7684                           mkU32(1))),
   7685                mkU32(1),
   7686                mkU32(0)))
   7687        );
   7688 
   7689    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7690       emulation warning. */
   7691    stmt( IRStmt_Put(
   7692             OFFB_ACFLAG,
   7693             IRExpr_ITE(
   7694                unop(Iop_32to1,
   7695                     binop(Iop_And32,
   7696                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7697                           mkU32(1))),
   7698                mkU32(1),
   7699                mkU32(0)))
   7700        );
   7701 
   7702    if (emit_AC_emwarn) {
   7703       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7704       stmt(
   7705          IRStmt_Exit(
   7706             binop( Iop_CmpNE32,
   7707                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7708                    mkU32(0) ),
   7709             Ijk_EmWarn,
   7710             IRConst_U32( next_insn_EIP ),
   7711             OFFB_EIP
   7712          )
   7713       );
   7714    }
   7715 }
   7716 
   7717 
   7718 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7719    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7720 
   7721    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7722 */
   7723 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7724 {
   7725    IRTemp aa      = newTemp(Ity_I64);
   7726    IRTemp bb      = newTemp(Ity_I64);
   7727    IRTemp aahi32s = newTemp(Ity_I64);
   7728    IRTemp aalo32s = newTemp(Ity_I64);
   7729    IRTemp bbhi32s = newTemp(Ity_I64);
   7730    IRTemp bblo32s = newTemp(Ity_I64);
   7731    IRTemp rHi     = newTemp(Ity_I64);
   7732    IRTemp rLo     = newTemp(Ity_I64);
   7733    IRTemp one32x2 = newTemp(Ity_I64);
   7734    assign(aa, aax);
   7735    assign(bb, bbx);
   7736    assign( aahi32s,
   7737            binop(Iop_SarN32x2,
   7738                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7739                  mkU8(16) ));
   7740    assign( aalo32s,
   7741            binop(Iop_SarN32x2,
   7742                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7743                  mkU8(16) ));
   7744    assign( bbhi32s,
   7745            binop(Iop_SarN32x2,
   7746                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7747                  mkU8(16) ));
   7748    assign( bblo32s,
   7749            binop(Iop_SarN32x2,
   7750                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7751                  mkU8(16) ));
   7752    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7753    assign(
   7754       rHi,
   7755       binop(
   7756          Iop_ShrN32x2,
   7757          binop(
   7758             Iop_Add32x2,
   7759             binop(
   7760                Iop_ShrN32x2,
   7761                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7762                mkU8(14)
   7763             ),
   7764             mkexpr(one32x2)
   7765          ),
   7766          mkU8(1)
   7767       )
   7768    );
   7769    assign(
   7770       rLo,
   7771       binop(
   7772          Iop_ShrN32x2,
   7773          binop(
   7774             Iop_Add32x2,
   7775             binop(
   7776                Iop_ShrN32x2,
   7777                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7778                mkU8(14)
   7779             ),
   7780             mkexpr(one32x2)
   7781          ),
   7782          mkU8(1)
   7783       )
   7784    );
   7785    return
   7786       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7787 }
   7788 
   7789 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7790    values (aa,bb), computes, for each lane:
   7791 
   7792           if aa_lane < 0 then - bb_lane
   7793      else if aa_lane > 0 then bb_lane
   7794      else 0
   7795 */
   7796 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7797 {
   7798    IRTemp aa       = newTemp(Ity_I64);
   7799    IRTemp bb       = newTemp(Ity_I64);
   7800    IRTemp zero     = newTemp(Ity_I64);
   7801    IRTemp bbNeg    = newTemp(Ity_I64);
   7802    IRTemp negMask  = newTemp(Ity_I64);
   7803    IRTemp posMask  = newTemp(Ity_I64);
   7804    IROp   opSub    = Iop_INVALID;
   7805    IROp   opCmpGTS = Iop_INVALID;
   7806 
   7807    switch (laneszB) {
   7808       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7809       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7810       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7811       default: vassert(0);
   7812    }
   7813 
   7814    assign( aa,      aax );
   7815    assign( bb,      bbx );
   7816    assign( zero,    mkU64(0) );
   7817    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7818    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7819    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7820 
   7821    return
   7822       binop(Iop_Or64,
   7823             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7824             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7825 
   7826 }
   7827 
   7828 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7829    value aa, computes, for each lane
   7830 
   7831    if aa < 0 then -aa else aa
   7832 
   7833    Note that the result is interpreted as unsigned, so that the
   7834    absolute value of the most negative signed input can be
   7835    represented.
   7836 */
   7837 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7838 {
   7839    IRTemp aa      = newTemp(Ity_I64);
   7840    IRTemp zero    = newTemp(Ity_I64);
   7841    IRTemp aaNeg   = newTemp(Ity_I64);
   7842    IRTemp negMask = newTemp(Ity_I64);
   7843    IRTemp posMask = newTemp(Ity_I64);
   7844    IROp   opSub   = Iop_INVALID;
   7845    IROp   opSarN  = Iop_INVALID;
   7846 
   7847    switch (laneszB) {
   7848       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7849       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7850       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7851       default: vassert(0);
   7852    }
   7853 
   7854    assign( aa,      aax );
   7855    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7856    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7857    assign( zero,    mkU64(0) );
   7858    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7859    return
   7860       binop(Iop_Or64,
   7861             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7862             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7863 }
   7864 
   7865 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7866                                         IRTemp lo64, Int byteShift )
   7867 {
   7868    vassert(byteShift >= 1 && byteShift <= 7);
   7869    return
   7870       binop(Iop_Or64,
   7871             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7872             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7873       );
   7874 }
   7875 
   7876 /* Generate a SIGSEGV followed by a restart of the current instruction
   7877    if effective_addr is not 16-aligned.  This is required behaviour
   7878    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7879    This assumes that guest_RIP_curr_instr is set correctly! */
   7880 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7881 {
   7882    stmt(
   7883       IRStmt_Exit(
   7884          binop(Iop_CmpNE32,
   7885                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
   7886                mkU32(0)),
   7887          Ijk_SigSEGV,
   7888          IRConst_U32(guest_EIP_curr_instr),
   7889          OFFB_EIP
   7890       )
   7891    );
   7892 }
   7893 
   7894 
   7895 /* Helper for deciding whether a given insn (starting at the opcode
   7896    byte) may validly be used with a LOCK prefix.  The following insns
   7897    may be used with LOCK when their destination operand is in memory.
   7898    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7899 
   7900    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7901    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7902    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7903    SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7904    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7905    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7906    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7907 
   7908    DEC        FE /1,  FF /1
   7909    INC        FE /0,  FF /0
   7910 
   7911    NEG        F6 /3,  F7 /3
   7912    NOT        F6 /2,  F7 /2
   7913 
   7914    XCHG       86, 87
   7915 
   7916    BTC        0F BB,  0F BA /7
   7917    BTR        0F B3,  0F BA /6
   7918    BTS        0F AB,  0F BA /5
   7919 
   7920    CMPXCHG    0F B0,  0F B1
   7921    CMPXCHG8B  0F C7 /1
   7922 
   7923    XADD       0F C0,  0F C1
   7924 
   7925    ------------------------------
   7926 
   7927    80 /0  =  addb $imm8,  rm8
   7928    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7929    82 /0  =  addb $imm8,  rm8
   7930    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7931 
   7932    00     =  addb r8,  rm8
   7933    01     =  addl r32, rm32  and  addw r16, rm16
   7934 
   7935    Same for ADD OR ADC SBB AND SUB XOR
   7936 
   7937    FE /1  = dec rm8
   7938    FF /1  = dec rm32  and  dec rm16
   7939 
   7940    FE /0  = inc rm8
   7941    FF /0  = inc rm32  and  inc rm16
   7942 
   7943    F6 /3  = neg rm8
   7944    F7 /3  = neg rm32  and  neg rm16
   7945 
   7946    F6 /2  = not rm8
   7947    F7 /2  = not rm32  and  not rm16
   7948 
   7949    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7950    OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
   7951 
   7952    Same for BTS, BTR
   7953 */
   7954 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   7955 {
   7956    switch (opc[0]) {
   7957       case 0x00: case 0x01: case 0x08: case 0x09:
   7958       case 0x10: case 0x11: case 0x18: case 0x19:
   7959       case 0x20: case 0x21: case 0x28: case 0x29:
   7960       case 0x30: case 0x31:
   7961          if (!epartIsReg(opc[1]))
   7962             return True;
   7963          break;
   7964 
   7965       case 0x80: case 0x81: case 0x82: case 0x83:
   7966          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
   7967              && !epartIsReg(opc[1]))
   7968             return True;
   7969          break;
   7970 
   7971       case 0xFE: case 0xFF:
   7972          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
   7973              && !epartIsReg(opc[1]))
   7974             return True;
   7975          break;
   7976 
   7977       case 0xF6: case 0xF7:
   7978          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
   7979              && !epartIsReg(opc[1]))
   7980             return True;
   7981          break;
   7982 
   7983       case 0x86: case 0x87:
   7984          if (!epartIsReg(opc[1]))
   7985             return True;
   7986          break;
   7987 
   7988       case 0x0F: {
   7989          switch (opc[1]) {
   7990             case 0xBB: case 0xB3: case 0xAB:
   7991                if (!epartIsReg(opc[2]))
   7992                   return True;
   7993                break;
   7994             case 0xBA:
   7995                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
   7996                    && !epartIsReg(opc[2]))
   7997                   return True;
   7998                break;
   7999             case 0xB0: case 0xB1:
   8000                if (!epartIsReg(opc[2]))
   8001                   return True;
   8002                break;
   8003             case 0xC7:
   8004                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   8005                   return True;
   8006                break;
   8007             case 0xC0: case 0xC1:
   8008                if (!epartIsReg(opc[2]))
   8009                   return True;
   8010                break;
   8011             default:
   8012                break;
   8013          } /* switch (opc[1]) */
   8014          break;
   8015       }
   8016 
   8017       default:
   8018          break;
   8019    } /* switch (opc[0]) */
   8020 
   8021    return False;
   8022 }
   8023 
   8024 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   8025 {
   8026    IRTemp t2 = newTemp(ty);
   8027    if (ty == Ity_I32) {
   8028       assign( t2,
   8029          binop(
   8030             Iop_Or32,
   8031             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   8032             binop(
   8033                Iop_Or32,
   8034                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   8035                                 mkU32(0x00FF0000)),
   8036                binop(Iop_Or32,
   8037                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   8038                                       mkU32(0x0000FF00)),
   8039                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   8040                                       mkU32(0x000000FF) )
   8041             )))
   8042       );
   8043       return t2;
   8044    }
   8045    if (ty == Ity_I16) {
   8046       assign(t2,
   8047              binop(Iop_Or16,
   8048                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   8049                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   8050       return t2;
   8051    }
   8052    vassert(0);
   8053    /*NOTREACHED*/
   8054    return IRTemp_INVALID;
   8055 }
   8056 
   8057 /*------------------------------------------------------------*/
   8058 /*--- Disassemble a single instruction                     ---*/
   8059 /*------------------------------------------------------------*/
   8060 
   8061 /* Disassemble a single instruction into IR.  The instruction is
   8062    located in host memory at &guest_code[delta].  *expect_CAS is set
   8063    to True if the resulting IR is expected to contain an IRCAS
   8064    statement, and False if it's not expected to.  This makes it
   8065    possible for the caller of disInstr_X86_WRK to check that
   8066    LOCK-prefixed instructions are at least plausibly translated, in
   8067    that it becomes possible to check that a (validly) LOCK-prefixed
   8068    instruction generates a translation containing an IRCAS, and
   8069    instructions without LOCK prefixes don't generate translations
   8070    containing an IRCAS.
   8071 */
   8072 static
   8073 DisResult disInstr_X86_WRK (
   8074              /*OUT*/Bool* expect_CAS,
   8075              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   8076              Bool         resteerCisOk,
   8077              void*        callback_opaque,
   8078              Long         delta64,
   8079              const VexArchInfo* archinfo,
   8080              const VexAbiInfo*  vbi,
   8081              Bool         sigill_diag
   8082           )
   8083 {
   8084    IRType    ty;
   8085    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   8086    Int       alen;
   8087    UChar     opc, modrm, abyte, pre;
   8088    UInt      d32;
   8089    HChar     dis_buf[50];
   8090    Int       am_sz, d_sz, n_prefixes;
   8091    DisResult dres;
   8092    const UChar* insn; /* used in SSE decoders */
   8093 
   8094    /* The running delta */
   8095    Int delta = (Int)delta64;
   8096 
   8097    /* Holds eip at the start of the insn, so that we can print
   8098       consistent error messages for unimplemented insns. */
   8099    Int delta_start = delta;
   8100 
   8101    /* sz denotes the nominal data-op size of the insn; we change it to
   8102       2 if an 0x66 prefix is seen */
   8103    Int sz = 4;
   8104 
   8105    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   8106       prefix has been seen, else one of {0x26, 0x36, 0x3E, 0x64, 0x65}
   8107       indicating the prefix.  */
   8108    UChar sorb = 0;
   8109 
   8110    /* Gets set to True if a LOCK prefix is seen. */
   8111    Bool pfx_lock = False;
   8112 
   8113    /* Set result defaults. */
   8114    dres.whatNext    = Dis_Continue;
   8115    dres.len         = 0;
   8116    dres.continueAt  = 0;
   8117    dres.hint        = Dis_HintNone;
   8118    dres.jk_StopHere = Ijk_INVALID;
   8119 
   8120    *expect_CAS = False;
   8121 
   8122    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   8123 
   8124    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   8125    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   8126 
   8127    /* Spot "Special" instructions (see comment at top of file). */
   8128    {
   8129       const UChar* code = guest_code + delta;
   8130       /* Spot the 12-byte preamble:
   8131          C1C703   roll $3,  %edi
   8132          C1C70D   roll $13, %edi
   8133          C1C71D   roll $29, %edi
   8134          C1C713   roll $19, %edi
   8135       */
   8136       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   8137           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   8138           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   8139           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   8140          /* Got a "Special" instruction preamble.  Which one is it? */
   8141          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   8142             /* %EDX = client_request ( %EAX ) */
   8143             DIP("%%edx = client_request ( %%eax )\n");
   8144             delta += 14;
   8145             jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
   8146             vassert(dres.whatNext == Dis_StopHere);
   8147             goto decode_success;
   8148          }
   8149          else
   8150          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   8151             /* %EAX = guest_NRADDR */
   8152             DIP("%%eax = guest_NRADDR\n");
   8153             delta += 14;
   8154             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   8155             goto decode_success;
   8156          }
   8157          else
   8158          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   8159             /* call-noredir *%EAX */
   8160             DIP("call-noredir *%%eax\n");
   8161             delta += 14;
   8162             t1 = newTemp(Ity_I32);
   8163             assign(t1, getIReg(4,R_EAX));
   8164             t2 = newTemp(Ity_I32);
   8165             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   8166             putIReg(4, R_ESP, mkexpr(t2));
   8167             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   8168             jmp_treg(&dres, Ijk_NoRedir, t1);
   8169             vassert(dres.whatNext == Dis_StopHere);
   8170             goto decode_success;
   8171          }
   8172          else
   8173          if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
   8174             /* IR injection */
   8175             DIP("IR injection\n");
   8176             vex_inject_ir(irsb, Iend_LE);
   8177 
   8178             // Invalidate the current insn. The reason is that the IRop we're
   8179             // injecting here can change. In which case the translation has to
   8180             // be redone. For ease of handling, we simply invalidate all the
   8181             // time.
   8182             stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
   8183             stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
   8184 
   8185             delta += 14;
   8186 
   8187             stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   8188             dres.whatNext    = Dis_StopHere;
   8189             dres.jk_StopHere = Ijk_InvalICache;
   8190             goto decode_success;
   8191          }
   8192          /* We don't know what it is. */
   8193          goto decode_failure;
   8194          /*NOTREACHED*/
   8195       }
   8196    }
   8197 
   8198    /* Handle a couple of weird-ass NOPs that have been observed in the
   8199       wild. */
   8200    {
   8201       const UChar* code = guest_code + delta;
   8202       /* Sun's JVM 1.5.0 uses the following as a NOP:
   8203          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   8204       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   8205           && code[3] == 0x65 && code[4] == 0x90) {
   8206          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   8207          delta += 5;
   8208          goto decode_success;
   8209       }
   8210       /* Don't barf on recent binutils padding,
   8211          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   8212          66 2e 0f 1f 84 00 00 00 00 00
   8213          66 66 2e 0f 1f 84 00 00 00 00 00
   8214          66 66 66 2e 0f 1f 84 00 00 00 00 00
   8215          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8216          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8217          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8218       */
   8219       if (code[0] == 0x66) {
   8220          Int data16_cnt;
   8221          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   8222             if (code[data16_cnt] != 0x66)
   8223                break;
   8224          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   8225              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   8226              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   8227              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   8228              && code[data16_cnt + 8] == 0x00 ) {
   8229             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   8230             delta += 9 + data16_cnt;
   8231             goto decode_success;
   8232          }
   8233       }
   8234 
   8235       // Intel CET requires the following opcodes to be treated as NOPs
   8236       // with any prefix and ModRM, SIB and disp combination:
   8237       // "0F 19", "0F 1C", "0F 1D", "0F 1E", "0F 1F"
   8238       UInt opcode_index = 0;
   8239       // Skip any prefix combination
   8240       UInt addr_override = 0;
   8241       UInt temp_sz = 4;
   8242       Bool is_prefix = True;
   8243       while (is_prefix) {
   8244          switch (code[opcode_index]) {
   8245             case 0x66:
   8246                temp_sz = 2;
   8247                opcode_index++;
   8248                break;
   8249             case 0x67:
   8250                addr_override = 1;
   8251                opcode_index++;
   8252                break;
   8253             case 0x26: case 0x3E: // if we set segment override here,
   8254             case 0x64: case 0x65: //  disAMode segfaults
   8255             case 0x2E: case 0x36:
   8256             case 0xF0: case 0xF2: case 0xF3:
   8257                opcode_index++;
   8258                break;
   8259             default:
   8260                is_prefix = False;
   8261          }
   8262       }
   8263       // Check the opcode
   8264       if (code[opcode_index] == 0x0F) {
   8265          switch (code[opcode_index+1]) {
   8266             case 0x19:
   8267             case 0x1C: case 0x1D:
   8268             case 0x1E: case 0x1F:
   8269                delta += opcode_index+2;
   8270                modrm = getUChar(delta);
   8271                if (epartIsReg(modrm)) {
   8272                   delta += 1;
   8273                   DIP("nop%c\n", nameISize(temp_sz));
   8274                }
   8275                else {
   8276                   addr = disAMode(&alen, 0/*"no sorb"*/, delta, dis_buf);
   8277                   delta += alen - addr_override;
   8278                   DIP("nop%c %s\n", nameISize(temp_sz), dis_buf);
   8279                }
   8280                goto decode_success;
   8281             default:
   8282                break;
   8283          }
   8284       }
   8285    }
   8286    /* Normal instruction handling starts here. */
   8287 
   8288    /* Deal with some but not all prefixes:
   8289          66(oso)
   8290          F0(lock)
   8291          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8292       Not dealt with (left in place):
   8293          F2 F3
   8294    */
   8295    n_prefixes = 0;
   8296    while (True) {
   8297       if (n_prefixes > 7) goto decode_failure;
   8298       pre = getUChar(delta);
   8299       switch (pre) {
   8300          case 0x66:
   8301             sz = 2;
   8302             break;
   8303          case 0xF0:
   8304             pfx_lock = True;
   8305             *expect_CAS = True;
   8306             break;
   8307          case 0x3E: /* %DS: */
   8308          case 0x26: /* %ES: */
   8309          case 0x64: /* %FS: */
   8310          case 0x65: /* %GS: */
   8311          case 0x36: /* %SS: */
   8312             if (sorb != 0)
   8313                goto decode_failure; /* only one seg override allowed */
   8314             sorb = pre;
   8315             break;
   8316          case 0x2E: { /* %CS: */
   8317             /* 2E prefix on a conditional branch instruction is a
   8318                branch-prediction hint, which can safely be ignored.  */
   8319             UChar op1 = getIByte(delta+1);
   8320             UChar op2 = getIByte(delta+2);
   8321             if ((op1 >= 0x70 && op1 <= 0x7F)
   8322                 || (op1 == 0xE3)
   8323                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8324                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8325             } else {
   8326                /* All other CS override cases are not handled */
   8327                goto decode_failure;
   8328             }
   8329             break;
   8330          }
   8331          default:
   8332             goto not_a_prefix;
   8333       }
   8334       n_prefixes++;
   8335       delta++;
   8336    }
   8337 
   8338    not_a_prefix:
   8339 
   8340    /* Now we should be looking at the primary opcode byte or the
   8341       leading F2 or F3.  Check that any LOCK prefix is actually
   8342       allowed. */
   8343 
   8344    if (pfx_lock) {
   8345      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
   8346          DIP("lock ");
   8347       } else {
   8348          *expect_CAS = False;
   8349          goto decode_failure;
   8350       }
   8351    }
   8352 
   8353 
   8354    /* ---------------------------------------------------- */
   8355    /* --- The SSE decoder.                             --- */
   8356    /* ---------------------------------------------------- */
   8357 
   8358    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8359       previous life? */
   8360 
   8361    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8362       later section, further on. */
   8363 
   8364    insn = &guest_code[delta];
   8365 
   8366    /* Treat fxsave specially.  It should be doable even on an SSE0
   8367       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8368       any subarchitecture variant.
   8369    */
   8370 
   8371    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8372    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8373        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8374       IRDirty* d;
   8375       modrm = getIByte(delta+2);
   8376       vassert(sz == 4);
   8377       vassert(!epartIsReg(modrm));
   8378 
   8379       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8380       delta += 2+alen;
   8381       gen_SEGV_if_not_16_aligned(addr);
   8382 
   8383       DIP("fxsave %s\n", dis_buf);
   8384 
   8385       /* Uses dirty helper:
   8386             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
   8387       d = unsafeIRDirty_0_N (
   8388              0/*regparms*/,
   8389              "x86g_dirtyhelper_FXSAVE",
   8390              &x86g_dirtyhelper_FXSAVE,
   8391              mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   8392           );
   8393 
   8394       /* declare we're writing memory */
   8395       d->mFx   = Ifx_Write;
   8396       d->mAddr = mkexpr(addr);
   8397       d->mSize = 464; /* according to recent Intel docs */
   8398 
   8399       /* declare we're reading guest state */
   8400       d->nFxState = 7;
   8401       vex_bzero(&d->fxState, sizeof(d->fxState));
   8402 
   8403       d->fxState[0].fx     = Ifx_Read;
   8404       d->fxState[0].offset = OFFB_FTOP;
   8405       d->fxState[0].size   = sizeof(UInt);
   8406 
   8407       d->fxState[1].fx     = Ifx_Read;
   8408       d->fxState[1].offset = OFFB_FPREGS;
   8409       d->fxState[1].size   = 8 * sizeof(ULong);
   8410 
   8411       d->fxState[2].fx     = Ifx_Read;
   8412       d->fxState[2].offset = OFFB_FPTAGS;
   8413       d->fxState[2].size   = 8 * sizeof(UChar);
   8414 
   8415       d->fxState[3].fx     = Ifx_Read;
   8416       d->fxState[3].offset = OFFB_FPROUND;
   8417       d->fxState[3].size   = sizeof(UInt);
   8418 
   8419       d->fxState[4].fx     = Ifx_Read;
   8420       d->fxState[4].offset = OFFB_FC3210;
   8421       d->fxState[4].size   = sizeof(UInt);
   8422 
   8423       d->fxState[5].fx     = Ifx_Read;
   8424       d->fxState[5].offset = OFFB_XMM0;
   8425       d->fxState[5].size   = 8 * sizeof(U128);
   8426 
   8427       d->fxState[6].fx     = Ifx_Read;
   8428       d->fxState[6].offset = OFFB_SSEROUND;
   8429       d->fxState[6].size   = sizeof(UInt);
   8430 
   8431       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8432 	 images are packed back-to-back.  If not, the value of
   8433 	 d->fxState[5].size is wrong. */
   8434       vassert(16 == sizeof(U128));
   8435       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8436 
   8437       stmt( IRStmt_Dirty(d) );
   8438 
   8439       goto decode_success;
   8440    }
   8441 
   8442    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8443    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8444        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8445       IRDirty* d;
   8446       modrm = getIByte(delta+2);
   8447       vassert(sz == 4);
   8448       vassert(!epartIsReg(modrm));
   8449 
   8450       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8451       delta += 2+alen;
   8452       gen_SEGV_if_not_16_aligned(addr);
   8453 
   8454       DIP("fxrstor %s\n", dis_buf);
   8455 
   8456       /* Uses dirty helper:
   8457             VexEmNote x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
   8458          NOTE:
   8459             the VexEmNote value is simply ignored (unlike for FRSTOR)
   8460       */
   8461       d = unsafeIRDirty_0_N (
   8462              0/*regparms*/,
   8463              "x86g_dirtyhelper_FXRSTOR",
   8464              &x86g_dirtyhelper_FXRSTOR,
   8465              mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   8466           );
   8467 
   8468       /* declare we're reading memory */
   8469       d->mFx   = Ifx_Read;
   8470       d->mAddr = mkexpr(addr);
   8471       d->mSize = 464; /* according to recent Intel docs */
   8472 
   8473       /* declare we're writing guest state */
   8474       d->nFxState = 7;
   8475       vex_bzero(&d->fxState, sizeof(d->fxState));
   8476 
   8477       d->fxState[0].fx     = Ifx_Write;
   8478       d->fxState[0].offset = OFFB_FTOP;
   8479       d->fxState[0].size   = sizeof(UInt);
   8480 
   8481       d->fxState[1].fx     = Ifx_Write;
   8482       d->fxState[1].offset = OFFB_FPREGS;
   8483       d->fxState[1].size   = 8 * sizeof(ULong);
   8484 
   8485       d->fxState[2].fx     = Ifx_Write;
   8486       d->fxState[2].offset = OFFB_FPTAGS;
   8487       d->fxState[2].size   = 8 * sizeof(UChar);
   8488 
   8489       d->fxState[3].fx     = Ifx_Write;
   8490       d->fxState[3].offset = OFFB_FPROUND;
   8491       d->fxState[3].size   = sizeof(UInt);
   8492 
   8493       d->fxState[4].fx     = Ifx_Write;
   8494       d->fxState[4].offset = OFFB_FC3210;
   8495       d->fxState[4].size   = sizeof(UInt);
   8496 
   8497       d->fxState[5].fx     = Ifx_Write;
   8498       d->fxState[5].offset = OFFB_XMM0;
   8499       d->fxState[5].size   = 8 * sizeof(U128);
   8500 
   8501       d->fxState[6].fx     = Ifx_Write;
   8502       d->fxState[6].offset = OFFB_SSEROUND;
   8503       d->fxState[6].size   = sizeof(UInt);
   8504 
   8505       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8506 	 images are packed back-to-back.  If not, the value of
   8507 	 d->fxState[5].size is wrong. */
   8508       vassert(16 == sizeof(U128));
   8509       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8510 
   8511       stmt( IRStmt_Dirty(d) );
   8512 
   8513       goto decode_success;
   8514    }
   8515 
   8516    /* ------ SSE decoder main ------ */
   8517 
   8518    /* Skip parts of the decoder which don't apply given the stated
   8519       guest subarchitecture. */
   8520    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8521       goto after_sse_decoders;
   8522 
   8523    /* With mmxext only some extended MMX instructions are recognized.
   8524       The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
   8525       PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
   8526       PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
   8527 
   8528       http://support.amd.com/us/Embedded_TechDocs/22466.pdf
   8529       https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
   8530 
   8531    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   8532       goto mmxext;
   8533 
   8534    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8535       for SSE1 here. */
   8536 
   8537    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8538    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8539       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8540       goto decode_success;
   8541    }
   8542 
   8543    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8544    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8545       vassert(sz == 4);
   8546       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8547       goto decode_success;
   8548    }
   8549 
   8550    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8551    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8552       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8553       goto decode_success;
   8554    }
   8555 
   8556    /* 0F 54 = ANDPS -- G = G and E */
   8557    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8558       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8559       goto decode_success;
   8560    }
   8561 
   8562    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8563    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8564       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8565       goto decode_success;
   8566    }
   8567 
   8568    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8569    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8570       vassert(sz == 4);
   8571       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8572       goto decode_success;
   8573    }
   8574 
   8575    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8576    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8577    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8578       IRTemp argL = newTemp(Ity_F32);
   8579       IRTemp argR = newTemp(Ity_F32);
   8580       modrm = getIByte(delta+2);
   8581       if (epartIsReg(modrm)) {
   8582          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8583          delta += 2+1;
   8584          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8585                                   nameXMMReg(gregOfRM(modrm)) );
   8586       } else {
   8587          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8588 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8589          delta += 2+alen;
   8590          DIP("[u]comiss %s,%s\n", dis_buf,
   8591                                   nameXMMReg(gregOfRM(modrm)) );
   8592       }
   8593       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8594 
   8595       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8596       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8597       stmt( IRStmt_Put(
   8598                OFFB_CC_DEP1,
   8599                binop( Iop_And32,
   8600                       binop(Iop_CmpF64,
   8601                             unop(Iop_F32toF64,mkexpr(argL)),
   8602                             unop(Iop_F32toF64,mkexpr(argR))),
   8603                       mkU32(0x45)
   8604           )));
   8605       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8606          elimination of previous stores to this field work better. */
   8607       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8608       goto decode_success;
   8609    }
   8610 
   8611    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8612       half xmm */
   8613    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8614       IRTemp arg64 = newTemp(Ity_I64);
   8615       IRTemp rmode = newTemp(Ity_I32);
   8616       vassert(sz == 4);
   8617 
   8618       modrm = getIByte(delta+2);
   8619       if (epartIsReg(modrm)) {
   8620          /* Only switch to MMX mode if the source is a MMX register.
   8621             See comments on CVTPI2PD for details.  Fixes #357059. */
   8622          do_MMX_preamble();
   8623          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8624          delta += 2+1;
   8625          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8626                                  nameXMMReg(gregOfRM(modrm)));
   8627       } else {
   8628          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8629 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8630          delta += 2+alen;
   8631          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8632                                  nameXMMReg(gregOfRM(modrm)) );
   8633       }
   8634 
   8635       assign( rmode, get_sse_roundingmode() );
   8636 
   8637       putXMMRegLane32F(
   8638          gregOfRM(modrm), 0,
   8639          binop(Iop_F64toF32,
   8640                mkexpr(rmode),
   8641                unop(Iop_I32StoF64,
   8642                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8643 
   8644       putXMMRegLane32F(
   8645          gregOfRM(modrm), 1,
   8646          binop(Iop_F64toF32,
   8647                mkexpr(rmode),
   8648                unop(Iop_I32StoF64,
   8649                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8650 
   8651       goto decode_success;
   8652    }
   8653 
   8654    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8655       quarter xmm */
   8656    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8657       IRTemp arg32 = newTemp(Ity_I32);
   8658       IRTemp rmode = newTemp(Ity_I32);
   8659       vassert(sz == 4);
   8660 
   8661       modrm = getIByte(delta+3);
   8662       if (epartIsReg(modrm)) {
   8663          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8664          delta += 3+1;
   8665          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8666                                  nameXMMReg(gregOfRM(modrm)));
   8667       } else {
   8668          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8669 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8670          delta += 3+alen;
   8671          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8672                                  nameXMMReg(gregOfRM(modrm)) );
   8673       }
   8674 
   8675       assign( rmode, get_sse_roundingmode() );
   8676 
   8677       putXMMRegLane32F(
   8678          gregOfRM(modrm), 0,
   8679          binop(Iop_F64toF32,
   8680                mkexpr(rmode),
   8681                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8682 
   8683       goto decode_success;
   8684    }
   8685 
   8686    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8687       I32 in mmx, according to prevailing SSE rounding mode */
   8688    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8689       I32 in mmx, rounding towards zero */
   8690    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8691       IRTemp dst64  = newTemp(Ity_I64);
   8692       IRTemp rmode  = newTemp(Ity_I32);
   8693       IRTemp f32lo  = newTemp(Ity_F32);
   8694       IRTemp f32hi  = newTemp(Ity_F32);
   8695       Bool   r2zero = toBool(insn[1] == 0x2C);
   8696 
   8697       do_MMX_preamble();
   8698       modrm = getIByte(delta+2);
   8699 
   8700       if (epartIsReg(modrm)) {
   8701          delta += 2+1;
   8702 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8703 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8704          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8705                                    nameXMMReg(eregOfRM(modrm)),
   8706                                    nameMMXReg(gregOfRM(modrm)));
   8707       } else {
   8708          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8709 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8710 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8711                                               mkexpr(addr),
   8712                                               mkU32(4) )));
   8713          delta += 2+alen;
   8714          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8715                                    dis_buf,
   8716                                    nameMMXReg(gregOfRM(modrm)));
   8717       }
   8718 
   8719       if (r2zero) {
   8720          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8721       } else {
   8722          assign( rmode, get_sse_roundingmode() );
   8723       }
   8724 
   8725       assign(
   8726          dst64,
   8727          binop( Iop_32HLto64,
   8728                 binop( Iop_F64toI32S,
   8729                        mkexpr(rmode),
   8730                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8731                 binop( Iop_F64toI32S,
   8732                        mkexpr(rmode),
   8733                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8734               )
   8735       );
   8736 
   8737       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8738       goto decode_success;
   8739    }
   8740 
   8741    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8742       I32 in ireg, according to prevailing SSE rounding mode */
   8743    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8744       I32 in ireg, rounding towards zero */
   8745    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8746        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8747       IRTemp rmode = newTemp(Ity_I32);
   8748       IRTemp f32lo = newTemp(Ity_F32);
   8749       Bool   r2zero = toBool(insn[2] == 0x2C);
   8750       vassert(sz == 4);
   8751 
   8752       modrm = getIByte(delta+3);
   8753       if (epartIsReg(modrm)) {
   8754          delta += 3+1;
   8755 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8756          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8757                                    nameXMMReg(eregOfRM(modrm)),
   8758                                    nameIReg(4, gregOfRM(modrm)));
   8759       } else {
   8760          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8761 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8762          delta += 3+alen;
   8763          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8764                                    dis_buf,
   8765                                    nameIReg(4, gregOfRM(modrm)));
   8766       }
   8767 
   8768       if (r2zero) {
   8769          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8770       } else {
   8771          assign( rmode, get_sse_roundingmode() );
   8772       }
   8773 
   8774       putIReg(4, gregOfRM(modrm),
   8775                  binop( Iop_F64toI32S,
   8776                         mkexpr(rmode),
   8777                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8778       );
   8779 
   8780       goto decode_success;
   8781    }
   8782 
   8783    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8784    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8785       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8786       goto decode_success;
   8787    }
   8788 
   8789    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8790    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8791       vassert(sz == 4);
   8792       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8793       goto decode_success;
   8794    }
   8795 
   8796    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8797    if (insn[0] == 0x0F && insn[1] == 0xAE
   8798        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8799 
   8800       IRTemp t64 = newTemp(Ity_I64);
   8801       IRTemp ew = newTemp(Ity_I32);
   8802 
   8803       modrm = getIByte(delta+2);
   8804       vassert(!epartIsReg(modrm));
   8805       vassert(sz == 4);
   8806 
   8807       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8808       delta += 2+alen;
   8809       DIP("ldmxcsr %s\n", dis_buf);
   8810 
   8811       /* The only thing we observe in %mxcsr is the rounding mode.
   8812          Therefore, pass the 32-bit value (SSE native-format control
   8813          word) to a clean helper, getting back a 64-bit value, the
   8814          lower half of which is the SSEROUND value to store, and the
   8815          upper half of which is the emulation-warning token which may
   8816          be generated.
   8817       */
   8818       /* ULong x86g_check_ldmxcsr ( UInt ); */
   8819       assign( t64, mkIRExprCCall(
   8820                       Ity_I64, 0/*regparms*/,
   8821                       "x86g_check_ldmxcsr",
   8822                       &x86g_check_ldmxcsr,
   8823                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8824                    )
   8825             );
   8826 
   8827       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8828       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8829       put_emwarn( mkexpr(ew) );
   8830       /* Finally, if an emulation warning was reported, side-exit to
   8831          the next insn, reporting the warning, so that Valgrind's
   8832          dispatcher sees the warning. */
   8833       stmt(
   8834          IRStmt_Exit(
   8835             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8836             Ijk_EmWarn,
   8837             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   8838             OFFB_EIP
   8839          )
   8840       );
   8841       goto decode_success;
   8842    }
   8843 
   8844 
   8845    /* mmxext sse1 subset starts here. mmxext only arches will parse
   8846       only this subset of the sse1 instructions. */
   8847   mmxext:
   8848 
   8849    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8850    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8851    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8852       Bool ok = False;
   8853       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8854       if (!ok)
   8855          goto decode_failure;
   8856       goto decode_success;
   8857    }
   8858 
   8859    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8860    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8861       Intel manual does not say anything about the usual business of
   8862       the FP reg tags getting trashed whenever an MMX insn happens.
   8863       So we just leave them alone.
   8864    */
   8865    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8866       modrm = getIByte(delta+2);
   8867       if (sz == 4 && !epartIsReg(modrm)) {
   8868          /* do_MMX_preamble(); Intel docs don't specify this */
   8869          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8870          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8871          DIP("movntq %s,%s\n", dis_buf,
   8872                                nameMMXReg(gregOfRM(modrm)));
   8873          delta += 2+alen;
   8874          goto decode_success;
   8875       }
   8876       /* else fall through */
   8877    }
   8878 
   8879    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8880    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8881    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8882       do_MMX_preamble();
   8883       delta = dis_MMXop_regmem_to_reg (
   8884                 sorb, delta+2, insn[1], "pavgb", False );
   8885       goto decode_success;
   8886    }
   8887 
   8888    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8889    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8890    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8891       do_MMX_preamble();
   8892       delta = dis_MMXop_regmem_to_reg (
   8893                 sorb, delta+2, insn[1], "pavgw", False );
   8894       goto decode_success;
   8895    }
   8896 
   8897    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8898    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8899       zero-extend of it in ireg(G). */
   8900    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8901       modrm = insn[2];
   8902       if (sz == 4 && epartIsReg(modrm)) {
   8903          IRTemp sV = newTemp(Ity_I64);
   8904          t5 = newTemp(Ity_I16);
   8905          do_MMX_preamble();
   8906          assign(sV, getMMXReg(eregOfRM(modrm)));
   8907          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8908          switch (insn[3] & 3) {
   8909             case 0:  assign(t5, mkexpr(t0)); break;
   8910             case 1:  assign(t5, mkexpr(t1)); break;
   8911             case 2:  assign(t5, mkexpr(t2)); break;
   8912             case 3:  assign(t5, mkexpr(t3)); break;
   8913             default: vassert(0); /*NOTREACHED*/
   8914          }
   8915          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8916          DIP("pextrw $%d,%s,%s\n",
   8917              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8918                            nameIReg(4,gregOfRM(modrm)));
   8919          delta += 4;
   8920          goto decode_success;
   8921       }
   8922       /* else fall through */
   8923    }
   8924 
   8925    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8926    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8927       put it into the specified lane of mmx(G). */
   8928    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8929       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8930          mmx reg.  t4 is the new lane value.  t5 is the original
   8931          mmx value. t6 is the new mmx value. */
   8932       Int lane;
   8933       t4 = newTemp(Ity_I16);
   8934       t5 = newTemp(Ity_I64);
   8935       t6 = newTemp(Ity_I64);
   8936       modrm = insn[2];
   8937       do_MMX_preamble();
   8938 
   8939       assign(t5, getMMXReg(gregOfRM(modrm)));
   8940       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8941 
   8942       if (epartIsReg(modrm)) {
   8943          assign(t4, getIReg(2, eregOfRM(modrm)));
   8944          delta += 3+1;
   8945          lane = insn[3+1-1];
   8946          DIP("pinsrw $%d,%s,%s\n", lane,
   8947                                    nameIReg(2,eregOfRM(modrm)),
   8948                                    nameMMXReg(gregOfRM(modrm)));
   8949       } else {
   8950          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8951          delta += 3+alen;
   8952          lane = insn[3+alen-1];
   8953          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8954          DIP("pinsrw $%d,%s,%s\n", lane,
   8955                                    dis_buf,
   8956                                    nameMMXReg(gregOfRM(modrm)));
   8957       }
   8958 
   8959       switch (lane & 3) {
   8960          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8961          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8962          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8963          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8964          default: vassert(0); /*NOTREACHED*/
   8965       }
   8966       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8967       goto decode_success;
   8968    }
   8969 
   8970    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8971    /* 0F EE = PMAXSW -- 16x4 signed max */
   8972    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8973       do_MMX_preamble();
   8974       delta = dis_MMXop_regmem_to_reg (
   8975                 sorb, delta+2, insn[1], "pmaxsw", False );
   8976       goto decode_success;
   8977    }
   8978 
   8979    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8980    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   8981    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   8982       do_MMX_preamble();
   8983       delta = dis_MMXop_regmem_to_reg (
   8984                 sorb, delta+2, insn[1], "pmaxub", False );
   8985       goto decode_success;
   8986    }
   8987 
   8988    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8989    /* 0F EA = PMINSW -- 16x4 signed min */
   8990    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   8991       do_MMX_preamble();
   8992       delta = dis_MMXop_regmem_to_reg (
   8993                 sorb, delta+2, insn[1], "pminsw", False );
   8994       goto decode_success;
   8995    }
   8996 
   8997    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8998    /* 0F DA = PMINUB -- 8x8 unsigned min */
   8999    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   9000       do_MMX_preamble();
   9001       delta = dis_MMXop_regmem_to_reg (
   9002                 sorb, delta+2, insn[1], "pminub", False );
   9003       goto decode_success;
   9004    }
   9005 
   9006    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9007    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   9008       mmx(E), turn them into a byte, and put zero-extend of it in
   9009       ireg(G). */
   9010    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   9011       modrm = insn[2];
   9012       if (epartIsReg(modrm)) {
   9013          do_MMX_preamble();
   9014          t0 = newTemp(Ity_I64);
   9015          t1 = newTemp(Ity_I32);
   9016          assign(t0, getMMXReg(eregOfRM(modrm)));
   9017          assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   9018          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   9019          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9020                                  nameIReg(4,gregOfRM(modrm)));
   9021          delta += 3;
   9022          goto decode_success;
   9023       }
   9024       /* else fall through */
   9025    }
   9026 
   9027    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9028    /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   9029    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   9030       do_MMX_preamble();
   9031       delta = dis_MMXop_regmem_to_reg (
   9032                 sorb, delta+2, insn[1], "pmuluh", False );
   9033       goto decode_success;
   9034    }
   9035 
   9036    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   9037    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   9038    /* 0F 18 /2 = PREFETCHT1 */
   9039    /* 0F 18 /3 = PREFETCHT2 */
   9040    if (insn[0] == 0x0F && insn[1] == 0x18
   9041        && !epartIsReg(insn[2])
   9042        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   9043       const HChar* hintstr = "??";
   9044 
   9045       modrm = getIByte(delta+2);
   9046       vassert(!epartIsReg(modrm));
   9047 
   9048       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9049       delta += 2+alen;
   9050 
   9051       switch (gregOfRM(modrm)) {
   9052          case 0: hintstr = "nta"; break;
   9053          case 1: hintstr = "t0"; break;
   9054          case 2: hintstr = "t1"; break;
   9055          case 3: hintstr = "t2"; break;
   9056          default: vassert(0); /*NOTREACHED*/
   9057       }
   9058 
   9059       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9060       goto decode_success;
   9061    }
   9062 
   9063    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9064    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9065    if (insn[0] == 0x0F && insn[1] == 0x0D
   9066        && !epartIsReg(insn[2])
   9067        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9068       const HChar* hintstr = "??";
   9069 
   9070       modrm = getIByte(delta+2);
   9071       vassert(!epartIsReg(modrm));
   9072 
   9073       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9074       delta += 2+alen;
   9075 
   9076       switch (gregOfRM(modrm)) {
   9077          case 0: hintstr = ""; break;
   9078          case 1: hintstr = "w"; break;
   9079          default: vassert(0); /*NOTREACHED*/
   9080       }
   9081 
   9082       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9083       goto decode_success;
   9084    }
   9085 
   9086    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9087    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9088    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9089       do_MMX_preamble();
   9090       delta = dis_MMXop_regmem_to_reg (
   9091                  sorb, delta+2, insn[1], "psadbw", False );
   9092       goto decode_success;
   9093    }
   9094 
   9095    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9096    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9097    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9098       Int order;
   9099       IRTemp sV, dV, s3, s2, s1, s0;
   9100       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9101       sV = newTemp(Ity_I64);
   9102       dV = newTemp(Ity_I64);
   9103       do_MMX_preamble();
   9104       modrm = insn[2];
   9105       if (epartIsReg(modrm)) {
   9106          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9107          order = (Int)insn[3];
   9108          delta += 2+2;
   9109          DIP("pshufw $%d,%s,%s\n", order,
   9110                                    nameMMXReg(eregOfRM(modrm)),
   9111                                    nameMMXReg(gregOfRM(modrm)));
   9112       } else {
   9113          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9114          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9115 	 order = (Int)insn[2+alen];
   9116          delta += 3+alen;
   9117          DIP("pshufw $%d,%s,%s\n", order,
   9118                                    dis_buf,
   9119                                    nameMMXReg(gregOfRM(modrm)));
   9120       }
   9121       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9122 
   9123 #     define SEL(n) \
   9124                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9125       assign(dV,
   9126 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9127                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9128       );
   9129       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9130 #     undef SEL
   9131       goto decode_success;
   9132    }
   9133 
   9134    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9135    if (insn[0] == 0x0F && insn[1] == 0xAE
   9136        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9137       vassert(sz == 4);
   9138       delta += 3;
   9139       /* Insert a memory fence.  It's sometimes important that these
   9140          are carried through to the generated code. */
   9141       stmt( IRStmt_MBE(Imbe_Fence) );
   9142       DIP("sfence\n");
   9143       goto decode_success;
   9144    }
   9145 
   9146    /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
   9147    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   9148       goto after_sse_decoders;
   9149 
   9150 
   9151    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   9152    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9153       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   9154       goto decode_success;
   9155    }
   9156 
   9157    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   9158    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   9159       vassert(sz == 4);
   9160       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   9161       goto decode_success;
   9162    }
   9163 
   9164    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   9165    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   9166       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   9167       goto decode_success;
   9168    }
   9169 
   9170    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   9171    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   9172       vassert(sz == 4);
   9173       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   9174       goto decode_success;
   9175    }
   9176 
   9177    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   9178    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   9179    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   9180       modrm = getIByte(delta+2);
   9181       if (epartIsReg(modrm)) {
   9182          putXMMReg( gregOfRM(modrm),
   9183                     getXMMReg( eregOfRM(modrm) ));
   9184          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9185                                   nameXMMReg(gregOfRM(modrm)));
   9186          delta += 2+1;
   9187       } else {
   9188          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9189          if (insn[1] == 0x28/*movaps*/)
   9190             gen_SEGV_if_not_16_aligned( addr );
   9191          putXMMReg( gregOfRM(modrm),
   9192                     loadLE(Ity_V128, mkexpr(addr)) );
   9193          DIP("mov[ua]ps %s,%s\n", dis_buf,
   9194                                   nameXMMReg(gregOfRM(modrm)));
   9195          delta += 2+alen;
   9196       }
   9197       goto decode_success;
   9198    }
   9199 
   9200    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   9201    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   9202    if (sz == 4 && insn[0] == 0x0F
   9203        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   9204       modrm = getIByte(delta+2);
   9205       if (epartIsReg(modrm)) {
   9206          /* fall through; awaiting test case */
   9207       } else {
   9208          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9209          if (insn[1] == 0x29/*movaps*/)
   9210             gen_SEGV_if_not_16_aligned( addr );
   9211          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9212          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9213                                   dis_buf );
   9214          delta += 2+alen;
   9215          goto decode_success;
   9216       }
   9217    }
   9218 
   9219    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   9220    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   9221    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   9222       modrm = getIByte(delta+2);
   9223       if (epartIsReg(modrm)) {
   9224          delta += 2+1;
   9225          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9226                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   9227          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9228                                nameXMMReg(gregOfRM(modrm)));
   9229       } else {
   9230          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9231          delta += 2+alen;
   9232          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9233                           loadLE(Ity_I64, mkexpr(addr)) );
   9234          DIP("movhps %s,%s\n", dis_buf,
   9235                                nameXMMReg( gregOfRM(modrm) ));
   9236       }
   9237       goto decode_success;
   9238    }
   9239 
   9240    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   9241    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   9242       if (!epartIsReg(insn[2])) {
   9243          delta += 2;
   9244          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9245          delta += alen;
   9246          storeLE( mkexpr(addr),
   9247                   getXMMRegLane64( gregOfRM(insn[2]),
   9248                                    1/*upper lane*/ ) );
   9249          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9250                                dis_buf);
   9251          goto decode_success;
   9252       }
   9253       /* else fall through */
   9254    }
   9255 
   9256    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   9257    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   9258    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   9259       modrm = getIByte(delta+2);
   9260       if (epartIsReg(modrm)) {
   9261          delta += 2+1;
   9262          putXMMRegLane64( gregOfRM(modrm),
   9263                           0/*lower lane*/,
   9264                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   9265          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   9266                                  nameXMMReg(gregOfRM(modrm)));
   9267       } else {
   9268          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9269          delta += 2+alen;
   9270          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   9271                           loadLE(Ity_I64, mkexpr(addr)) );
   9272          DIP("movlps %s, %s\n",
   9273              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   9274       }
   9275       goto decode_success;
   9276    }
   9277 
   9278    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   9279    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   9280       if (!epartIsReg(insn[2])) {
   9281          delta += 2;
   9282          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9283          delta += alen;
   9284          storeLE( mkexpr(addr),
   9285                   getXMMRegLane64( gregOfRM(insn[2]),
   9286                                    0/*lower lane*/ ) );
   9287          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9288                                 dis_buf);
   9289          goto decode_success;
   9290       }
   9291       /* else fall through */
   9292    }
   9293 
   9294    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   9295       to 4 lowest bits of ireg(G) */
   9296    if (insn[0] == 0x0F && insn[1] == 0x50) {
   9297       modrm = getIByte(delta+2);
   9298       if (sz == 4 && epartIsReg(modrm)) {
   9299          Int src;
   9300          t0 = newTemp(Ity_I32);
   9301          t1 = newTemp(Ity_I32);
   9302          t2 = newTemp(Ity_I32);
   9303          t3 = newTemp(Ity_I32);
   9304          delta += 2+1;
   9305          src = eregOfRM(modrm);
   9306          assign( t0, binop( Iop_And32,
   9307                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   9308                             mkU32(1) ));
   9309          assign( t1, binop( Iop_And32,
   9310                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   9311                             mkU32(2) ));
   9312          assign( t2, binop( Iop_And32,
   9313                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   9314                             mkU32(4) ));
   9315          assign( t3, binop( Iop_And32,
   9316                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   9317                             mkU32(8) ));
   9318          putIReg(4, gregOfRM(modrm),
   9319                     binop(Iop_Or32,
   9320                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   9321                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   9322                          )
   9323                  );
   9324          DIP("movmskps %s,%s\n", nameXMMReg(src),
   9325                                  nameIReg(4, gregOfRM(modrm)));
   9326          goto decode_success;
   9327       }
   9328       /* else fall through */
   9329    }
   9330 
   9331    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   9332    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   9333    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   9334       modrm = getIByte(delta+2);
   9335       if (!epartIsReg(modrm)) {
   9336          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9337          gen_SEGV_if_not_16_aligned( addr );
   9338          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9339          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   9340                                  dis_buf,
   9341                                  nameXMMReg(gregOfRM(modrm)));
   9342          delta += 2+alen;
   9343          goto decode_success;
   9344       }
   9345       /* else fall through */
   9346    }
   9347 
   9348    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   9349       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   9350    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   9351       vassert(sz == 4);
   9352       modrm = getIByte(delta+3);
   9353       if (epartIsReg(modrm)) {
   9354          putXMMRegLane32( gregOfRM(modrm), 0,
   9355                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   9356          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9357                               nameXMMReg(gregOfRM(modrm)));
   9358          delta += 3+1;
   9359       } else {
   9360          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9361          /* zero bits 127:64 */
   9362          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   9363          /* zero bits 63:32 */
   9364          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   9365          /* write bits 31:0 */
   9366          putXMMRegLane32( gregOfRM(modrm), 0,
   9367                           loadLE(Ity_I32, mkexpr(addr)) );
   9368          DIP("movss %s,%s\n", dis_buf,
   9369                               nameXMMReg(gregOfRM(modrm)));
   9370          delta += 3+alen;
   9371       }
   9372       goto decode_success;
   9373    }
   9374 
   9375    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   9376       or lo 1/4 xmm). */
   9377    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   9378       vassert(sz == 4);
   9379       modrm = getIByte(delta+3);
   9380       if (epartIsReg(modrm)) {
   9381          /* fall through, we don't yet have a test case */
   9382       } else {
   9383          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9384          storeLE( mkexpr(addr),
   9385                   getXMMRegLane32(gregOfRM(modrm), 0) );
   9386          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9387                               dis_buf);
   9388          delta += 3+alen;
   9389          goto decode_success;
   9390       }
   9391    }
   9392 
   9393    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   9394    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   9395       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   9396       goto decode_success;
   9397    }
   9398 
   9399    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   9400    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   9401       vassert(sz == 4);
   9402       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   9403       goto decode_success;
   9404    }
   9405 
   /* 0F 56 = ORPS -- G = G or E */
   9407    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   9408       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   9409       goto decode_success;
   9410    }
   9411 
   9412    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9413    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9414       vassert(sz == 4);
   9415       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9416                                         "rcpps", Iop_RecipEst32Fx4 );
   9417       goto decode_success;
   9418    }
   9419 
   9420    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9421    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9422       vassert(sz == 4);
   9423       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9424                                          "rcpss", Iop_RecipEst32F0x4 );
   9425       goto decode_success;
   9426    }
   9427 
   9428    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9429    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9430       vassert(sz == 4);
   9431       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9432                                         "rsqrtps", Iop_RSqrtEst32Fx4 );
   9433       goto decode_success;
   9434    }
   9435 
   9436    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9437    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9438       vassert(sz == 4);
   9439       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9440                                          "rsqrtss", Iop_RSqrtEst32F0x4 );
   9441       goto decode_success;
   9442    }
   9443 
   9444    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9445    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9446       Int    select;
   9447       IRTemp sV, dV;
   9448       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9449       sV = newTemp(Ity_V128);
   9450       dV = newTemp(Ity_V128);
   9451       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9452       modrm = insn[2];
   9453       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9454 
   9455       if (epartIsReg(modrm)) {
   9456          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9457          select = (Int)insn[3];
   9458          delta += 2+2;
   9459          DIP("shufps $%d,%s,%s\n", select,
   9460                                    nameXMMReg(eregOfRM(modrm)),
   9461                                    nameXMMReg(gregOfRM(modrm)));
   9462       } else {
   9463          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9464          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9465          select = (Int)insn[2+alen];
   9466          delta += 3+alen;
   9467          DIP("shufps $%d,%s,%s\n", select,
   9468                                    dis_buf,
   9469                                    nameXMMReg(gregOfRM(modrm)));
   9470       }
   9471 
   9472       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9473       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9474 
   9475 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9476 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9477 
   9478       putXMMReg(
   9479          gregOfRM(modrm),
   9480          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9481                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9482       );
   9483 
   9484 #     undef SELD
   9485 #     undef SELS
   9486 
   9487       goto decode_success;
   9488    }
   9489 
   /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9491    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9492       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9493                                         "sqrtps", Iop_Sqrt32Fx4 );
   9494       goto decode_success;
   9495    }
   9496 
   /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9498    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9499       vassert(sz == 4);
   9500       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9501                                          "sqrtss", Iop_Sqrt32F0x4 );
   9502       goto decode_success;
   9503    }
   9504 
   9505    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9506    if (insn[0] == 0x0F && insn[1] == 0xAE
   9507        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9508       modrm = getIByte(delta+2);
   9509       vassert(sz == 4);
   9510       vassert(!epartIsReg(modrm));
   9511 
   9512       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9513       delta += 2+alen;
   9514 
   9515       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9516          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9517       */
      /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9519       DIP("stmxcsr %s\n", dis_buf);
   9520       storeLE( mkexpr(addr),
   9521                mkIRExprCCall(
   9522                   Ity_I32, 0/*regp*/,
   9523                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9524                   mkIRExprVec_1( get_sse_roundingmode() )
   9525                )
   9526              );
   9527       goto decode_success;
   9528    }
   9529 
   9530    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9531    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9532       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9533       goto decode_success;
   9534    }
   9535 
   9536    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9537    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9538       vassert(sz == 4);
   9539       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9540       goto decode_success;
   9541    }
   9542 
   9543    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9544    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9545    /* These just appear to be special cases of SHUFPS */
   9546    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9547       IRTemp sV, dV;
   9548       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9549       Bool hi = toBool(insn[1] == 0x15);
   9550       sV = newTemp(Ity_V128);
   9551       dV = newTemp(Ity_V128);
   9552       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9553       modrm = insn[2];
   9554       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9555 
   9556       if (epartIsReg(modrm)) {
   9557          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9558          delta += 2+1;
   9559          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9560                                   nameXMMReg(eregOfRM(modrm)),
   9561                                   nameXMMReg(gregOfRM(modrm)));
   9562       } else {
   9563          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9564          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9565          delta += 2+alen;
   9566          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9567                                   dis_buf,
   9568                                   nameXMMReg(gregOfRM(modrm)));
   9569       }
   9570 
   9571       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9572       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9573 
   9574       if (hi) {
   9575          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9576       } else {
   9577          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9578       }
   9579 
   9580       goto decode_success;
   9581    }
   9582 
   /* 0F 57 = XORPS -- G = G xor E */
   9584    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9585       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9586       goto decode_success;
   9587    }
   9588 
   9589    /* ---------------------------------------------------- */
   9590    /* --- end of the SSE decoder.                      --- */
   9591    /* ---------------------------------------------------- */
   9592 
   9593    /* ---------------------------------------------------- */
   9594    /* --- start of the SSE2 decoder.                   --- */
   9595    /* ---------------------------------------------------- */
   9596 
   9597    /* Skip parts of the decoder which don't apply given the stated
   9598       guest subarchitecture. */
   9599    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9600       goto after_sse_decoders; /* no SSE2 capabilities */
   9601 
   9602    insn = &guest_code[delta];
   9603 
   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9605    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9606       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9607       goto decode_success;
   9608    }
   9609 
   9610    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9611    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9612       vassert(sz == 4);
   9613       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9614       goto decode_success;
   9615    }
   9616 
   9617    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9618    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9619       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9620       goto decode_success;
   9621    }
   9622 
   9623    /* 66 0F 54 = ANDPD -- G = G and E */
   9624    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9625       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9626       goto decode_success;
   9627    }
   9628 
   9629    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9630    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9631       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9632       goto decode_success;
   9633    }
   9634 
   9635    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9636    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9637       vassert(sz == 4);
   9638       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9639       goto decode_success;
   9640    }
   9641 
   9642    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9643    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9644    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9645       IRTemp argL = newTemp(Ity_F64);
   9646       IRTemp argR = newTemp(Ity_F64);
   9647       modrm = getIByte(delta+2);
   9648       if (epartIsReg(modrm)) {
   9649          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9650          delta += 2+1;
   9651          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9652                                   nameXMMReg(gregOfRM(modrm)) );
   9653       } else {
   9654          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9655 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9656          delta += 2+alen;
   9657          DIP("[u]comisd %s,%s\n", dis_buf,
   9658                                   nameXMMReg(gregOfRM(modrm)) );
   9659       }
   9660       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9661 
   9662       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9663       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9664       stmt( IRStmt_Put(
   9665                OFFB_CC_DEP1,
   9666                binop( Iop_And32,
   9667                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9668                       mkU32(0x45)
   9669           )));
   9670       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9671          elimination of previous stores to this field work better. */
   9672       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9673       goto decode_success;
   9674    }
   9675 
   9676    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9677       F64 in xmm(G) */
   9678    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9679       IRTemp arg64 = newTemp(Ity_I64);
   9680       vassert(sz == 4);
   9681 
   9682       modrm = getIByte(delta+3);
   9683       if (epartIsReg(modrm)) {
   9684          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9685          delta += 3+1;
   9686          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9687                                  nameXMMReg(gregOfRM(modrm)));
   9688       } else {
   9689          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9690 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9691          delta += 3+alen;
   9692          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9693                                  nameXMMReg(gregOfRM(modrm)) );
   9694       }
   9695 
   9696       putXMMRegLane64F(
   9697          gregOfRM(modrm), 0,
   9698          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9699       );
   9700 
   9701       putXMMRegLane64F(
   9702          gregOfRM(modrm), 1,
   9703          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9704       );
   9705 
   9706       goto decode_success;
   9707    }
   9708 
   9709    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9710       xmm(G) */
   9711    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9712       IRTemp argV  = newTemp(Ity_V128);
   9713       IRTemp rmode = newTemp(Ity_I32);
   9714 
   9715       modrm = getIByte(delta+2);
   9716       if (epartIsReg(modrm)) {
   9717          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9718          delta += 2+1;
   9719          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9720                                  nameXMMReg(gregOfRM(modrm)));
   9721       } else {
   9722          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9723 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9724          delta += 2+alen;
   9725          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9726                                  nameXMMReg(gregOfRM(modrm)) );
   9727       }
   9728 
   9729       assign( rmode, get_sse_roundingmode() );
   9730       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9731 
   9732 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9733                              mkexpr(rmode),                   \
   9734                              unop(Iop_I32StoF64,mkexpr(_t)))
   9735 
   9736       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9737       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9738       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9739       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9740 
   9741 #     undef CVT
   9742 
   9743       goto decode_success;
   9744    }
   9745 
   9746    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9747       lo half xmm(G), and zero upper half */
   9748    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9749       IRTemp argV  = newTemp(Ity_V128);
   9750       IRTemp rmode = newTemp(Ity_I32);
   9751       vassert(sz == 4);
   9752 
   9753       modrm = getIByte(delta+3);
   9754       if (epartIsReg(modrm)) {
   9755          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9756          delta += 3+1;
   9757          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9758                                  nameXMMReg(gregOfRM(modrm)));
   9759       } else {
   9760          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9761 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9762          delta += 3+alen;
   9763          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9764                                  nameXMMReg(gregOfRM(modrm)) );
   9765       }
   9766 
   9767       assign( rmode, get_sse_roundingmode() );
   9768       t0 = newTemp(Ity_F64);
   9769       t1 = newTemp(Ity_F64);
   9770       assign( t0, unop(Iop_ReinterpI64asF64,
   9771                        unop(Iop_V128to64, mkexpr(argV))) );
   9772       assign( t1, unop(Iop_ReinterpI64asF64,
   9773                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9774 
   9775 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9776                              mkexpr(rmode),                   \
   9777                              mkexpr(_t) )
   9778 
   9779       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9780       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9781       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9782       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9783 
   9784 #     undef CVT
   9785 
   9786       goto decode_success;
   9787    }
   9788 
   9789    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9790       I32 in mmx, according to prevailing SSE rounding mode */
   9791    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9792       I32 in mmx, rounding towards zero */
   9793    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9794       IRTemp dst64  = newTemp(Ity_I64);
   9795       IRTemp rmode  = newTemp(Ity_I32);
   9796       IRTemp f64lo  = newTemp(Ity_F64);
   9797       IRTemp f64hi  = newTemp(Ity_F64);
   9798       Bool   r2zero = toBool(insn[1] == 0x2C);
   9799 
   9800       do_MMX_preamble();
   9801       modrm = getIByte(delta+2);
   9802 
   9803       if (epartIsReg(modrm)) {
   9804          delta += 2+1;
   9805 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9806 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9807          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9808                                    nameXMMReg(eregOfRM(modrm)),
   9809                                    nameMMXReg(gregOfRM(modrm)));
   9810       } else {
   9811          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9812 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9813 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9814                                               mkexpr(addr),
   9815                                               mkU32(8) )));
   9816          delta += 2+alen;
   9817          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   9818                                    dis_buf,
   9819                                    nameMMXReg(gregOfRM(modrm)));
   9820       }
   9821 
   9822       if (r2zero) {
   9823          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9824       } else {
   9825          assign( rmode, get_sse_roundingmode() );
   9826       }
   9827 
   9828       assign(
   9829          dst64,
   9830          binop( Iop_32HLto64,
   9831                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9832                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9833               )
   9834       );
   9835 
   9836       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9837       goto decode_success;
   9838    }
   9839 
   9840    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9841       lo half xmm(G), and zero upper half */
   9842    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9843       been nicer to merge them together, but the insn[] offsets differ
   9844       by one. */
   9845    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9846       IRTemp argV  = newTemp(Ity_V128);
   9847       IRTemp rmode = newTemp(Ity_I32);
   9848 
   9849       modrm = getIByte(delta+2);
   9850       if (epartIsReg(modrm)) {
   9851          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9852          delta += 2+1;
   9853          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9854                                  nameXMMReg(gregOfRM(modrm)));
   9855       } else {
   9856          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9857 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9858          delta += 2+alen;
   9859          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9860                                  nameXMMReg(gregOfRM(modrm)) );
   9861       }
   9862 
   9863       assign( rmode, get_sse_roundingmode() );
   9864       t0 = newTemp(Ity_F64);
   9865       t1 = newTemp(Ity_F64);
   9866       assign( t0, unop(Iop_ReinterpI64asF64,
   9867                        unop(Iop_V128to64, mkexpr(argV))) );
   9868       assign( t1, unop(Iop_ReinterpI64asF64,
   9869                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9870 
   9871 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9872                              mkexpr(rmode),                   \
   9873                              mkexpr(_t) )
   9874 
   9875       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9876       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9877       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9878       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9879 
   9880 #     undef CVT
   9881 
   9882       goto decode_success;
   9883    }
   9884 
   9885    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9886       xmm(G) */
   9887    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9888       IRTemp arg64 = newTemp(Ity_I64);
   9889 
   9890       modrm = getIByte(delta+2);
   9891       if (epartIsReg(modrm)) {
   9892          /* Only switch to MMX mode if the source is a MMX register.
   9893             This is inconsistent with all other instructions which
   9894             convert between XMM and (M64 or MMX), which always switch
   9895             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9896             least, that's what the Intel docs seem to me to say.
   9897             Fixes #210264. */
   9898          do_MMX_preamble();
   9899          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9900          delta += 2+1;
   9901          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9902                                  nameXMMReg(gregOfRM(modrm)));
   9903       } else {
   9904          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9905 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9906          delta += 2+alen;
   9907          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9908                                  nameXMMReg(gregOfRM(modrm)) );
   9909       }
   9910 
   9911       putXMMRegLane64F(
   9912          gregOfRM(modrm), 0,
   9913          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9914       );
   9915 
   9916       putXMMRegLane64F(
   9917          gregOfRM(modrm), 1,
   9918          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9919       );
   9920 
   9921       goto decode_success;
   9922    }
   9923 
   9924    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9925       xmm(G) */
   9926    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9927       IRTemp argV  = newTemp(Ity_V128);
   9928       IRTemp rmode = newTemp(Ity_I32);
   9929 
   9930       modrm = getIByte(delta+2);
   9931       if (epartIsReg(modrm)) {
   9932          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9933          delta += 2+1;
   9934          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9935                                  nameXMMReg(gregOfRM(modrm)));
   9936       } else {
   9937          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9938 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9939          delta += 2+alen;
   9940          DIP("cvtps2dq %s,%s\n", dis_buf,
   9941                                  nameXMMReg(gregOfRM(modrm)) );
   9942       }
   9943 
   9944       assign( rmode, get_sse_roundingmode() );
   9945       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9946 
   9947       /* This is less than ideal.  If it turns out to be a performance
   9948 	 bottleneck it can be improved. */
   9949 #     define CVT(_t)                            \
   9950         binop( Iop_F64toI32S,                   \
   9951                mkexpr(rmode),                   \
   9952                unop( Iop_F32toF64,              \
   9953                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9954 
   9955       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9956       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9957       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9958       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9959 
   9960 #     undef CVT
   9961 
   9962       goto decode_success;
   9963    }
   9964 
   9965    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9966       F64 in xmm(G). */
           /* The widening is done with the unary Iop_F32toF64, so no
              rounding mode is fetched for this case. */
   9967    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9968       IRTemp f32lo = newTemp(Ity_F32);
   9969       IRTemp f32hi = newTemp(Ity_F32);
   9970 
              /* The modrm byte follows the two opcode bytes (0F 5A). */
   9971       modrm = getIByte(delta+2);
   9972       if (epartIsReg(modrm)) {
                 /* Register source: 32-bit lanes 0 and 1 of xmm(E).
                    Consumes 2 opcode bytes + 1 modrm byte. */
   9973          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9974          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9975          delta += 2+1;
   9976          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9977                                  nameXMMReg(gregOfRM(modrm)));
   9978       } else {
                 /* Memory source: two little-endian F32 loads, at addr and
                    addr+4.  alen (set by disAMode) is added to delta along
                    with the 2 opcode bytes. */
   9979          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9980 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9981 	 assign( f32hi, loadLE(Ity_F32,
   9982                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9983          delta += 2+alen;
   9984          DIP("cvtps2pd %s,%s\n", dis_buf,
   9985                                  nameXMMReg(gregOfRM(modrm)) );
   9986       }
   9987 
              /* Both 64-bit lanes of xmm(G) are overwritten. */
   9988       putXMMRegLane64F( gregOfRM(modrm), 1,
   9989                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9990       putXMMRegLane64F( gregOfRM(modrm), 0,
   9991                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9992 
   9993       goto decode_success;
   9994    }
   9995 
   9996    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9997       I32 in ireg, according to prevailing SSE rounding mode */
   9998    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9999       I32 in ireg, rounding towards zero */
           /* Both forms share this case; they differ only in the rounding
              mode handed to Iop_F64toI32S. */
   10000    if (insn[0] == 0xF2 && insn[1] == 0x0F
   10001        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   10002       IRTemp rmode = newTemp(Ity_I32);
   10003       IRTemp f64lo = newTemp(Ity_F64);
               /* True for the truncating (0x2C) variant. */
   10004       Bool   r2zero = toBool(insn[2] == 0x2C);
   10005       vassert(sz == 4);
   10006 
               /* Three bytes (F2 0F 2D/2C) precede the modrm byte. */
   10007       modrm = getIByte(delta+3);
   10008       if (epartIsReg(modrm)) {
   10009          delta += 3+1;
   10010 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   10011          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   10012                                    nameXMMReg(eregOfRM(modrm)),
   10013                                    nameIReg(4, gregOfRM(modrm)));
   10014       } else {
   10015          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10016 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10017          delta += 3+alen;
   10018          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   10019                                    dis_buf,
   10020                                    nameIReg(4, gregOfRM(modrm)));
   10021       }
   10022 
               /* CVTTSD2SI always rounds to zero; CVTSD2SI uses whatever
                  get_sse_roundingmode() supplies. */
   10023       if (r2zero) {
   10024          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10025       } else {
   10026          assign( rmode, get_sse_roundingmode() );
   10027       }
   10028 
               /* Result goes to the 32-bit integer register named by G. */
   10029       putIReg(4, gregOfRM(modrm),
   10030                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   10031 
   10032       goto decode_success;
   10033    }
   10034 
   10035    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   10036       low 1/4 xmm(G), according to prevailing SSE rounding mode */
            /* Only 32-bit lane 0 of xmm(G) is written by this case. */
   10037    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   10038       IRTemp rmode = newTemp(Ity_I32);
   10039       IRTemp f64lo = newTemp(Ity_F64);
   10040       vassert(sz == 4);
   10041 
               /* Three bytes (F2 0F 5A) precede the modrm byte. */
   10042       modrm = getIByte(delta+3);
   10043       if (epartIsReg(modrm)) {
   10044          delta += 3+1;
   10045 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   10046          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10047                                  nameXMMReg(gregOfRM(modrm)));
   10048       } else {
   10049          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10050 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10051          delta += 3+alen;
   10052          DIP("cvtsd2ss %s,%s\n", dis_buf,
   10053                                  nameXMMReg(gregOfRM(modrm)));
   10054       }
   10055 
               /* Narrowing needs a rounding mode, unlike the widening
                  Iop_F32toF64. */
   10056       assign( rmode, get_sse_roundingmode() );
   10057       putXMMRegLane32F(
   10058          gregOfRM(modrm), 0,
   10059          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   10060       );
   10061 
   10062       goto decode_success;
   10063    }
   10064 
   10065    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   10066       half xmm */
            /* The conversion is the unary Iop_I32StoF64 (signed source, per
               the op name); no rounding mode is passed.  Only 64-bit lane 0
               of xmm(G) is written. */
   10067    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   10068       IRTemp arg32 = newTemp(Ity_I32);
   10069       vassert(sz == 4);
   10070 
               /* Three bytes (F2 0F 2A) precede the modrm byte. */
   10071       modrm = getIByte(delta+3);
   10072       if (epartIsReg(modrm)) {
                  /* E names a 32-bit integer register here, not an xmm. */
   10073          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   10074          delta += 3+1;
   10075          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   10076                                  nameXMMReg(gregOfRM(modrm)));
   10077       } else {
   10078          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10079 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   10080          delta += 3+alen;
   10081          DIP("cvtsi2sd %s,%s\n", dis_buf,
   10082                                  nameXMMReg(gregOfRM(modrm)) );
   10083       }
   10084 
   10085       putXMMRegLane64F(
   10086          gregOfRM(modrm), 0,
   10087          unop(Iop_I32StoF64, mkexpr(arg32)) );
   10088 
   10089       goto decode_success;
   10090    }
   10091 
   10092    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   10093       low half xmm(G) */
            /* Widening conversion via the unary Iop_F32toF64; only 64-bit
               lane 0 of xmm(G) is written. */
   10094    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   10095       IRTemp f32lo = newTemp(Ity_F32);
   10096       vassert(sz == 4);
   10097 
               /* Three bytes (F3 0F 5A) precede the modrm byte. */
   10098       modrm = getIByte(delta+3);
   10099       if (epartIsReg(modrm)) {
   10100          delta += 3+1;
   10101 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   10102          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10103                                  nameXMMReg(gregOfRM(modrm)));
   10104       } else {
   10105          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10106 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   10107          delta += 3+alen;
   10108          DIP("cvtss2sd %s,%s\n", dis_buf,
   10109                                  nameXMMReg(gregOfRM(modrm)));
   10110       }
   10111 
   10112       putXMMRegLane64F( gregOfRM(modrm), 0,
   10113                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   10114 
   10115       goto decode_success;
   10116    }
   10117 
   10118    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   10119       lo half xmm(G), and zero upper half, rounding towards zero */
            /* The 66 prefix in the encoding above is matched here as
               sz == 2 (presumably set by prefix decoding outside this
               section), so only two opcode bytes precede the modrm byte. */
   10120    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   10121       IRTemp argV  = newTemp(Ity_V128);
   10122       IRTemp rmode = newTemp(Ity_I32);
   10123 
   10124       modrm = getIByte(delta+2);
   10125       if (epartIsReg(modrm)) {
   10126          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10127          delta += 2+1;
   10128          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10129                                   nameXMMReg(gregOfRM(modrm)));
   10130       } else {
   10131          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10132 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10133          delta += 2+alen;
   10134          DIP("cvttpd2dq %s,%s\n", dis_buf,
   10135                                   nameXMMReg(gregOfRM(modrm)) );
   10136       }
   10137 
               /* Truncating variant: rounding mode is fixed at Irrm_ZERO. */
   10138       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10139 
               /* Split the 128-bit source into its low (t0) and high (t1)
                  64-bit halves, reinterpreted as F64. */
   10140       t0 = newTemp(Ity_F64);
   10141       t1 = newTemp(Ity_F64);
   10142       assign( t0, unop(Iop_ReinterpI64asF64,
   10143                        unop(Iop_V128to64, mkexpr(argV))) );
   10144       assign( t1, unop(Iop_ReinterpI64asF64,
   10145                        unop(Iop_V128HIto64, mkexpr(argV))) );
   10146 
               /* Local helper macro: F64 temp -> I32 using rmode. */
   10147 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   10148                              mkexpr(rmode),                   \
   10149                              mkexpr(_t) )
   10150 
               /* Upper two 32-bit lanes are zeroed; lower two receive the
                  converted values. */
   10151       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   10152       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   10153       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10154       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10155 
   10156 #     undef CVT
   10157 
   10158       goto decode_success;
   10159    }
   10160 
   10161    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   10162       xmm(G), rounding towards zero */
   10163    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   10164       IRTemp argV  = newTemp(Ity_V128);
   10165       IRTemp rmode = newTemp(Ity_I32);
   10166       vassert(sz == 4);
   10167 
               /* Three bytes (F3 0F 5B) precede the modrm byte. */
   10168       modrm = getIByte(delta+3);
   10169       if (epartIsReg(modrm)) {
   10170          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10171          delta += 3+1;
   10172          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10173                                   nameXMMReg(gregOfRM(modrm)));
   10174       } else {
   10175          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10176 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10177          delta += 3+alen;
   10178          DIP("cvttps2dq %s,%s\n", dis_buf,
   10179                                   nameXMMReg(gregOfRM(modrm)) );
   10180       }
   10181 
               /* Truncating variant: rounding mode is fixed at Irrm_ZERO.
                  The source is split into four 32-bit pieces, t3 .. t0. */
   10182       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10183       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   10184 
   10185       /* This is less than ideal.  If it turns out to be a performance
   10186 	 bottleneck it can be improved. */
               /* Each lane is reinterpreted as F32, widened to F64, and then
                  converted to I32 with Iop_F64toI32S. */
   10187 #     define CVT(_t)                            \
   10188         binop( Iop_F64toI32S,                   \
   10189                mkexpr(rmode),                   \
   10190                unop( Iop_F32toF64,              \
   10191                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10192 
   10193       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   10194       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   10195       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10196       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10197 
   10198 #     undef CVT
   10199 
   10200       goto decode_success;
   10201    }
   10202 
   10203    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_all, which is
               handed the offset just past the two opcode bytes and returns
               the updated delta. */
   10204    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   10205       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   10206       goto decode_success;
   10207    }
   10208 
   10209    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_lo64, which is
               handed the offset just past the three bytes F2 0F 5E and
               returns the updated delta. */
   10210    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   10211       vassert(sz == 4);
   10212       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   10213       goto decode_success;
   10214    }
   10215 
   10216    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   10217    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
            /* Only register-form modrm bytes with reg field 5 or 6 match.
               Both instructions are translated to the same Imbe_Fence
               event; they differ only in the printed mnemonic. */
   10218    if (insn[0] == 0x0F && insn[1] == 0xAE
   10219        && epartIsReg(insn[2])
   10220        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   10221       vassert(sz == 4);
               /* Two opcode bytes plus the modrm byte. */
   10222       delta += 3;
   10223       /* Insert a memory fence.  It's sometimes important that these
   10224          are carried through to the generated code. */
   10225       stmt( IRStmt_MBE(Imbe_Fence) );
   10226       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   10227       goto decode_success;
   10228    }
   10229 
   10230    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_all, starting
               just past the two opcode bytes; it returns the new delta. */
   10231    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   10232       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   10233       goto decode_success;
   10234    }
   10235 
   10236    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_lo64, starting
               just past the three bytes F2 0F 5F; it returns the new
               delta. */
   10237    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   10238       vassert(sz == 4);
   10239       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   10240       goto decode_success;
   10241    }
   10242 
   10243    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_all, starting
               just past the two opcode bytes; it returns the new delta. */
   10244    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   10245       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   10246       goto decode_success;
   10247    }
   10248 
   10249    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
            /* Operand decoding is delegated to dis_SSE_E_to_G_lo64, starting
               just past the three bytes F2 0F 5D; it returns the new
               delta. */
   10250    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   10251       vassert(sz == 4);
   10252       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   10253       goto decode_success;
   10254    }
   10255 
   10256    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   10257    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   10258    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
            /* All three are handled as a plain 128-bit move; the only
               difference is that the memory forms of MOVAPD and MOVDQA get
               an alignment check first. */
   10259    if (sz == 2 && insn[0] == 0x0F
   10260        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
               /* Mnemonic suffix, used only for the disassembly printout. */
   10261       const HChar* wot = insn[1]==0x28 ? "apd" :
   10262                          insn[1]==0x10 ? "upd" : "dqa";
   10263       modrm = getIByte(delta+2);
   10264       if (epartIsReg(modrm)) {
   10265          putXMMReg( gregOfRM(modrm),
   10266                     getXMMReg( eregOfRM(modrm) ));
   10267          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   10268                                    nameXMMReg(gregOfRM(modrm)));
   10269          delta += 2+1;
   10270       } else {
   10271          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
                  /* MOVUPD (0x10) is the unaligned form and skips the
                     check. */
   10272          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   10273             gen_SEGV_if_not_16_aligned( addr );
   10274          putXMMReg( gregOfRM(modrm),
   10275                     loadLE(Ity_V128, mkexpr(addr)) );
   10276          DIP("mov%s %s,%s\n", wot, dis_buf,
   10277                                    nameXMMReg(gregOfRM(modrm)));
   10278          delta += 2+alen;
   10279       }
   10280       goto decode_success;
   10281    }
   10282 
   10283    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   10284    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
            /* Only the memory-destination form is decoded here.  For a
               register destination nothing is emitted, delta is left
               untouched and control drops out of this case to whatever
               follows. */
   10285    if (sz == 2 && insn[0] == 0x0F
   10286        && (insn[1] == 0x29 || insn[1] == 0x11)) {
               /* Mnemonic suffix, used only for the disassembly printout. */
   10287       const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   10288       modrm = getIByte(delta+2);
   10289       if (epartIsReg(modrm)) {
   10290          /* fall through; awaiting test case */
   10291       } else {
   10292          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
                  /* Only the aligned form (MOVAPD) gets the alignment
                     check. */
   10293          if (insn[1] == 0x29/*movapd*/)
   10294             gen_SEGV_if_not_16_aligned( addr );
   10295          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10296          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   10297                                    dis_buf );
   10298          delta += 2+alen;
   10299          goto decode_success;
   10300       }
   10301    }
   10302 
   10303    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
            /* The zeroing of the upper lanes comes from Iop_32UtoV128, whose
               result replaces the whole of xmm(G). */
   10304    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10305       modrm = getIByte(delta+2);
   10306       if (epartIsReg(modrm)) {
                  /* Source is the 32-bit integer register named by E. */
   10307          delta += 2+1;
   10308          putXMMReg(
   10309             gregOfRM(modrm),
   10310             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10311          );
   10312          DIP("movd %s, %s\n",
   10313              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10314       } else {
                  /* Source is a 32-bit little-endian load from memory. */
   10315          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10316          delta += 2+alen;
   10317          putXMMReg(
   10318             gregOfRM(modrm),
   10319             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10320          );
   10321          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10322       }
   10323       goto decode_success;
   10324    }
   10325 
   10326    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
            /* The source is 32-bit lane 0 of xmm(G); the destination is
               either the integer register named by E or a 32-bit store. */
   10327    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10328       modrm = getIByte(delta+2);
   10329       if (epartIsReg(modrm)) {
   10330          delta += 2+1;
   10331          putIReg( 4, eregOfRM(modrm),
   10332                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10333          DIP("movd %s, %s\n",
   10334              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10335       } else {
   10336          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10337          delta += 2+alen;
   10338          storeLE( mkexpr(addr),
   10339                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10340          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10341       }
   10342       goto decode_success;
   10343    }
   10344 
   10345    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
            /* Both destination forms are handled; the memory form gets an
               alignment check before the 128-bit store. */
   10346    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10347       modrm = getIByte(delta+2);
   10348       if (epartIsReg(modrm)) {
   10349          delta += 2+1;
   10350          putXMMReg( eregOfRM(modrm),
   10351                     getXMMReg(gregOfRM(modrm)) );
   10352          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10353                                 nameXMMReg(eregOfRM(modrm)));
   10354       } else {
   10355          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10356          delta += 2+alen;
   10357          gen_SEGV_if_not_16_aligned( addr );
   10358          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10359          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10360       }
   10361       goto decode_success;
   10362    }
   10363 
   10364    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10365    /* Unfortunately can't simply use the MOVDQA case since the
   10366       prefix lengths are different (66 vs F3) */
            /* Here F3 is matched as insn[0], so three bytes precede the
               modrm byte.  No alignment check is made on the memory
               form. */
   10367    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10368       vassert(sz == 4);
   10369       modrm = getIByte(delta+3);
   10370       if (epartIsReg(modrm)) {
   10371          putXMMReg( gregOfRM(modrm),
   10372                     getXMMReg( eregOfRM(modrm) ));
   10373          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10374                                nameXMMReg(gregOfRM(modrm)));
   10375          delta += 3+1;
   10376       } else {
   10377          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10378          putXMMReg( gregOfRM(modrm),
   10379                     loadLE(Ity_V128, mkexpr(addr)) );
   10380          DIP("movdqu %s,%s\n", dis_buf,
   10381                                nameXMMReg(gregOfRM(modrm)));
   10382          delta += 3+alen;
   10383       }
   10384       goto decode_success;
   10385    }
   10386 
   10387    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10388    /* Unfortunately can't simply use the MOVDQA case since the
   10389       prefix lengths are different (66 vs F3) */
            /* Here F3 is matched as insn[0], so three bytes precede the
               modrm byte.  No alignment check is made on the memory
               form. */
   10390    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10391       vassert(sz == 4);
   10392       modrm = getIByte(delta+3);
   10393       if (epartIsReg(modrm)) {
   10394          delta += 3+1;
   10395          putXMMReg( eregOfRM(modrm),
   10396                     getXMMReg(gregOfRM(modrm)) );
   10397          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10398                                 nameXMMReg(eregOfRM(modrm)));
   10399       } else {
   10400          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10401          delta += 3+alen;
   10402          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10403          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10404       }
   10405       goto decode_success;
   10406    }
   10407 
   10408    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
            /* Only the register form is decoded; a memory-form modrm leaves
               delta untouched and drops out of this case. */
   10409    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10410       vassert(sz == 4);
   10411       modrm = getIByte(delta+3);
   10412       if (epartIsReg(modrm)) {
                  /* do_MMX_preamble() is called before the MMX register is
                     written. */
   10413          do_MMX_preamble();
   10414          putMMXReg( gregOfRM(modrm),
   10415                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10416          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10417                                 nameMMXReg(gregOfRM(modrm)));
   10418          delta += 3+1;
   10419          goto decode_success;
   10420       } else {
   10421          /* fall through, apparently no mem case for this insn */
   10422       }
   10423    }
   10424 
   10425    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10426    /* This seems identical to MOVHPS.  This instruction encoding is
   10427       completely crazy. */
            /* Only the memory form is decoded; a register-form modrm leaves
               delta untouched and drops out of this case. */
   10428    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10429       modrm = getIByte(delta+2);
   10430       if (epartIsReg(modrm)) {
   10431          /* fall through; apparently reg-reg is not possible */
   10432       } else {
   10433          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10434          delta += 2+alen;
                  /* 64-bit load into lane 1; lane 0 of xmm(G) is not
                     written. */
   10435          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10436                           loadLE(Ity_I64, mkexpr(addr)) );
   10437          DIP("movhpd %s,%s\n", dis_buf,
   10438                                nameXMMReg( gregOfRM(modrm) ));
   10439          goto decode_success;
   10440       }
   10441    }
   10442 
   10443    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10444    /* Again, this seems identical to MOVHPS. */
            /* Only the memory form is decoded.  insn[2] is used directly as
               the modrm byte here. */
   10445    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10446       if (!epartIsReg(insn[2])) {
                  /* Step over the two opcode bytes first, then over the
                     addressing-mode bytes. */
   10447          delta += 2;
   10448          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10449          delta += alen;
   10450          storeLE( mkexpr(addr),
   10451                   getXMMRegLane64( gregOfRM(insn[2]),
   10452                                    1/*upper lane*/ ) );
   10453          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10454                                dis_buf);
   10455          goto decode_success;
   10456       }
   10457       /* else fall through */
   10458    }
   10459 
   10460    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10461    /* Identical to MOVLPS ? */
            /* Only the memory form is decoded; a register-form modrm leaves
               delta untouched and drops out of this case. */
   10462    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10463       modrm = getIByte(delta+2);
   10464       if (epartIsReg(modrm)) {
   10465          /* fall through; apparently reg-reg is not possible */
   10466       } else {
   10467          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10468          delta += 2+alen;
                  /* 64-bit load into lane 0; lane 1 of xmm(G) is not
                     written. */
   10469          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10470                           loadLE(Ity_I64, mkexpr(addr)) );
   10471          DIP("movlpd %s, %s\n",
   10472              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10473          goto decode_success;
   10474       }
   10475    }
   10476 
   10477    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10478    /* Identical to MOVLPS ? */
            /* Only the memory form is decoded.  insn[2] is used directly as
               the modrm byte here. */
   10479    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10480       if (!epartIsReg(insn[2])) {
                  /* Step over the two opcode bytes first, then over the
                     addressing-mode bytes. */
   10481          delta += 2;
   10482          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10483          delta += alen;
   10484          storeLE( mkexpr(addr),
   10485                   getXMMRegLane64( gregOfRM(insn[2]),
   10486                                    0/*lower lane*/ ) );
   10487          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10488                                 dis_buf);
   10489          goto decode_success;
   10490       }
   10491       /* else fall through */
   10492    }
   10493 
   10494    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10495       2 lowest bits of ireg(G) */
            /* The sz == 2 and register-form tests are made inside the outer
               'if'; anything else drops out of this case with delta
               unchanged. */
   10496    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10497       modrm = getIByte(delta+2);
   10498       if (sz == 2 && epartIsReg(modrm)) {
   10499          Int src;
   10500          t0 = newTemp(Ity_I32);
   10501          t1 = newTemp(Ity_I32);
   10502          delta += 2+1;
   10503          src = eregOfRM(modrm);
                  /* Sign of the low F64 is bit 31 of 32-bit lane 1; shifting
                     right by 31 puts it at bit 0. */
   10504          assign( t0, binop( Iop_And32,
   10505                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10506                             mkU32(1) ));
                  /* Sign of the high F64 is bit 31 of 32-bit lane 3; shifting
                     right by 30 puts it at bit 1, which the mask of 2
                     isolates. */
   10507          assign( t1, binop( Iop_And32,
   10508                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10509                             mkU32(2) ));
   10510          putIReg(4, gregOfRM(modrm),
   10511                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10512                  );
   10513          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10514                                  nameIReg(4, gregOfRM(modrm)));
   10515          goto decode_success;
   10516       }
   10517       /* else fall through */
   10518    }
   10519 
   10520    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
            /* Implemented as a 16-byte read-modify-write at the address in
               EDI: data comes from xmm(G), the byte-select mask is derived
               from xmm(E). */
   10521    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10522       modrm = getIByte(delta+2);
   10523       if (sz == 2 && epartIsReg(modrm)) {
   10524          IRTemp regD    = newTemp(Ity_V128);
   10525          IRTemp mask    = newTemp(Ity_V128);
   10526          IRTemp olddata = newTemp(Ity_V128);
   10527          IRTemp newdata = newTemp(Ity_V128);
   10528                 addr    = newTemp(Ity_I32);
   10529 
                  /* The destination address is EDI passed through
                     handleSegOverride; no addressing mode is decoded. */
   10530          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10531          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10532 
   10533          /* Unfortunately can't do the obvious thing with SarN8x16
   10534             here since that can't be re-emitted as SSE2 code - no such
   10535             insn. */
                  /* Arithmetic shift right by 7 replicates each byte's top
                     bit across that byte, done separately on the two 64-bit
                     halves of xmm(E). */
   10536 	 assign(
   10537             mask,
   10538             binop(Iop_64HLtoV128,
   10539                   binop(Iop_SarN8x8,
   10540                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10541                         mkU8(7) ),
   10542                   binop(Iop_SarN8x8,
   10543                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10544                         mkU8(7) ) ));
                  /* newdata = (regD & mask) | (olddata & ~mask).
                     NOTE(review): all 16 bytes are loaded and stored back,
                     including bytes whose mask bit is clear. */
   10545          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10546          assign( newdata,
   10547                  binop(Iop_OrV128,
   10548                        binop(Iop_AndV128,
   10549                              mkexpr(regD),
   10550                              mkexpr(mask) ),
   10551                        binop(Iop_AndV128,
   10552                              mkexpr(olddata),
   10553                              unop(Iop_NotV128, mkexpr(mask)))) );
   10554          storeLE( mkexpr(addr), mkexpr(newdata) );
   10555 
   10556          delta += 2+1;
   10557          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10558                                    nameXMMReg( gregOfRM(modrm) ) );
   10559          goto decode_success;
   10560       }
   10561       /* else fall through */
   10562    }
   10563 
   10564    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
            /* Only the memory-destination form with sz == 2 is decoded;
               anything else drops out of this case with delta unchanged.
               The store is preceded by a 16-byte alignment check. */
   10565    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10566       modrm = getIByte(delta+2);
   10567       if (sz == 2 && !epartIsReg(modrm)) {
   10568          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10569          gen_SEGV_if_not_16_aligned( addr );
   10570          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
                  /* Print source register first, then the memory operand,
                     matching the other xmm-to-memory store cases. */
   10571          DIP("movntdq %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10572                                 dis_buf);
   10573          delta += 2+alen;
   10574          goto decode_success;
   10575       }
   10576       /* else fall through */
   10577    }
   10578 
   10579    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
            /* Only the memory-destination form is decoded; a register-form
               modrm drops out of this case with delta unchanged. */
   10580    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10581       vassert(sz == 4);
   10582       modrm = getIByte(delta+2);
   10583       if (!epartIsReg(modrm)) {
   10584          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10585          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
                  /* Print source register first, then the memory operand,
                     matching the other register-to-memory store cases. */
   10586          DIP("movnti %s,%s\n", nameIReg(4, gregOfRM(modrm)),
   10587                                dis_buf);
   10588          delta += 2+alen;
   10589          goto decode_success;
   10590       }
   10591       /* else fall through */
   10592    }
   10593 
   10594    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10595       or lo half xmm).  */
            /* Only the memory-destination form is decoded here.  For a
               register destination nothing is emitted, delta is left
               untouched and control drops out of this case. */
   10596    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10597       modrm = getIByte(delta+2);
   10598       if (epartIsReg(modrm)) {
   10599          /* fall through, awaiting test case */
   10600          /* dst: lo half copied, hi half zeroed */
   10601       } else {
   10602          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10603          storeLE( mkexpr(addr),
   10604                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10605          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10606          delta += 2+alen;
   10607          goto decode_success;
   10608       }
   10609    }
   10610 
   10611    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10612       hi half). */
            /* Only the register form is decoded; a memory-form modrm leaves
               delta untouched and drops out of this case.  The upper half
               is zeroed by Iop_64UtoV128, whose result replaces the whole
               of xmm(G). */
   10613    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10614       vassert(sz == 4);
   10615       modrm = getIByte(delta+3);
   10616       if (epartIsReg(modrm)) {
                  /* do_MMX_preamble() is called before the MMX register is
                     read. */
   10617          do_MMX_preamble();
   10618          putXMMReg( gregOfRM(modrm),
   10619                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10620          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10621                                 nameXMMReg(gregOfRM(modrm)));
   10622          delta += 3+1;
   10623          goto decode_success;
   10624       } else {
   10625          /* fall through, apparently no mem case for this insn */
   10626       }
   10627    }
   10628 
   10629    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10630       G (lo half xmm).  Upper half of G is zeroed out. */
   10631    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10632       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10633       If E is reg, upper half of G is unchanged. */
            /* The two instructions share this case.  They differ only in
               the register form, where MOVQ additionally zeroes the upper
               half of G, and in the mnemonic printed. */
   10634    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10635        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
               /* Mnemonic for the disassembly printout: F3 selects MOVQ,
                  F2 selects MOVSD. */
               const HChar* nm = insn[0] == 0xF3 ? "movq" : "movsd";
   10636       vassert(sz == 4);
   10637       modrm = getIByte(delta+3);
   10638       if (epartIsReg(modrm)) {
   10639          putXMMRegLane64( gregOfRM(modrm), 0,
   10640                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10641          if (insn[0] == 0xF3/*MOVQ*/) {
   10642             /* zero bits 127:64 */
   10643             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10644          }
   10645          DIP("%s %s,%s\n", nm, nameXMMReg(eregOfRM(modrm)),
   10646                                nameXMMReg(gregOfRM(modrm)));
   10647          delta += 3+1;
   10648       } else {
   10649          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10650          /* zero bits 127:64 */
   10651          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10652          /* write bits 63:0 */
   10653          putXMMRegLane64( gregOfRM(modrm), 0,
   10654                           loadLE(Ity_I64, mkexpr(addr)) );
   10655          DIP("%s %s,%s\n", nm, dis_buf,
   10656                                nameXMMReg(gregOfRM(modrm)));
   10657          delta += 3+alen;
   10658       }
   10659       goto decode_success;
   10660    }
   10661 
   10662    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10663       or lo half xmm). */
            /* In the register form only 64-bit lane 0 of xmm(E) is written;
               its upper lane is left as it was. */
   10664    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10665       vassert(sz == 4);
               /* Three bytes (F2 0F 11) precede the modrm byte. */
   10666       modrm = getIByte(delta+3);
   10667       if (epartIsReg(modrm)) {
   10668          putXMMRegLane64( eregOfRM(modrm), 0,
   10669                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10670          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10671                               nameXMMReg(eregOfRM(modrm)));
   10672          delta += 3+1;
   10673       } else {
   10674          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10675          storeLE( mkexpr(addr),
   10676                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10677          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10678                               dis_buf);
   10679          delta += 3+alen;
   10680       }
   10681       goto decode_success;
   10682    }
   10683 
   10684    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10685    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10686       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10687       goto decode_success;
   10688    }
   10689 
   10690    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10691    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10692       vassert(sz == 4);
   10693       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10694       goto decode_success;
   10695    }
   10696 
   10697    /* 66 0F 56 = ORPD -- G = G or E */
   10698    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10699       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10700       goto decode_success;
   10701    }
   10702 
   10703    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10704    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10705       Int    select;
   10706       IRTemp sV = newTemp(Ity_V128);
   10707       IRTemp dV = newTemp(Ity_V128);
   10708       IRTemp s1 = newTemp(Ity_I64);
   10709       IRTemp s0 = newTemp(Ity_I64);
   10710       IRTemp d1 = newTemp(Ity_I64);
   10711       IRTemp d0 = newTemp(Ity_I64);
   10712 
   10713       modrm = insn[2];
   10714       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10715 
   10716       if (epartIsReg(modrm)) {
   10717          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10718          select = (Int)insn[3];
   10719          delta += 2+2;
   10720          DIP("shufpd $%d,%s,%s\n", select,
   10721                                    nameXMMReg(eregOfRM(modrm)),
   10722                                    nameXMMReg(gregOfRM(modrm)));
   10723       } else {
   10724          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10725          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10726          select = (Int)insn[2+alen];
   10727          delta += 3+alen;
   10728          DIP("shufpd $%d,%s,%s\n", select,
   10729                                    dis_buf,
   10730                                    nameXMMReg(gregOfRM(modrm)));
   10731       }
   10732 
   10733       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10734       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10735       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10736       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10737 
   10738 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10739 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10740 
   10741       putXMMReg(
   10742          gregOfRM(modrm),
   10743          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10744       );
   10745 
   10746 #     undef SELD
   10747 #     undef SELS
   10748 
   10749       goto decode_success;
   10750    }
   10751 
   10752    /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
   10753    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10754       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10755                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10756       goto decode_success;
   10757    }
   10758 
   10759    /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
   10760    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10761       vassert(sz == 4);
   10762       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10763                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10764       goto decode_success;
   10765    }
   10766 
   10767    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10768    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10769       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10770       goto decode_success;
   10771    }
   10772 
   10773    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10774    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10775       vassert(sz == 4);
   10776       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10777       goto decode_success;
   10778    }
   10779 
   10780    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10781    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10782    /* These just appear to be special cases of SHUFPD */
   10783    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10784       IRTemp s1 = newTemp(Ity_I64);
   10785       IRTemp s0 = newTemp(Ity_I64);
   10786       IRTemp d1 = newTemp(Ity_I64);
   10787       IRTemp d0 = newTemp(Ity_I64);
   10788       IRTemp sV = newTemp(Ity_V128);
   10789       IRTemp dV = newTemp(Ity_V128);
   10790       Bool   hi = toBool(insn[1] == 0x15);
   10791 
   10792       modrm = insn[2];
   10793       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10794 
   10795       if (epartIsReg(modrm)) {
   10796          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10797          delta += 2+1;
   10798          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10799                                   nameXMMReg(eregOfRM(modrm)),
   10800                                   nameXMMReg(gregOfRM(modrm)));
   10801       } else {
   10802          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10803          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10804          delta += 2+alen;
   10805          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10806                                   dis_buf,
   10807                                   nameXMMReg(gregOfRM(modrm)));
   10808       }
   10809 
   10810       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10811       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10812       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10813       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10814 
   10815       if (hi) {
   10816          putXMMReg( gregOfRM(modrm),
   10817                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10818       } else {
   10819          putXMMReg( gregOfRM(modrm),
   10820                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10821       }
   10822 
   10823       goto decode_success;
   10824    }
   10825 
   10826    /* 66 0F 57 = XORPD -- G = G xor E */
   10827    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10828       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10829       goto decode_success;
   10830    }
   10831 
   10832    /* 66 0F 6B = PACKSSDW */
   10833    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10834       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10835                                  "packssdw",
   10836                                  Iop_QNarrowBin32Sto16Sx8, True );
   10837       goto decode_success;
   10838    }
   10839 
   10840    /* 66 0F 63 = PACKSSWB */
   10841    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10842       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10843                                  "packsswb",
   10844                                  Iop_QNarrowBin16Sto8Sx16, True );
   10845       goto decode_success;
   10846    }
   10847 
   10848    /* 66 0F 67 = PACKUSWB */
   10849    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10850       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10851                                  "packuswb",
   10852                                  Iop_QNarrowBin16Sto8Ux16, True );
   10853       goto decode_success;
   10854    }
   10855 
   10856    /* 66 0F FC = PADDB */
   10857    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10858       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10859                                  "paddb", Iop_Add8x16, False );
   10860       goto decode_success;
   10861    }
   10862 
   10863    /* 66 0F FE = PADDD */
   10864    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10865       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10866                                  "paddd", Iop_Add32x4, False );
   10867       goto decode_success;
   10868    }
   10869 
   10870    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10871    /* 0F D4 = PADDQ -- add 64x1 */
   10872    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10873       do_MMX_preamble();
   10874       delta = dis_MMXop_regmem_to_reg (
   10875                 sorb, delta+2, insn[1], "paddq", False );
   10876       goto decode_success;
   10877    }
   10878 
   10879    /* 66 0F D4 = PADDQ */
   10880    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10881       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10882                                  "paddq", Iop_Add64x2, False );
   10883       goto decode_success;
   10884    }
   10885 
   10886    /* 66 0F FD = PADDW */
   10887    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10888       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10889                                  "paddw", Iop_Add16x8, False );
   10890       goto decode_success;
   10891    }
   10892 
   10893    /* 66 0F EC = PADDSB */
   10894    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10895       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10896                                  "paddsb", Iop_QAdd8Sx16, False );
   10897       goto decode_success;
   10898    }
   10899 
   10900    /* 66 0F ED = PADDSW */
   10901    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10902       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10903                                  "paddsw", Iop_QAdd16Sx8, False );
   10904       goto decode_success;
   10905    }
   10906 
   10907    /* 66 0F DC = PADDUSB */
   10908    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10909       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10910                                  "paddusb", Iop_QAdd8Ux16, False );
   10911       goto decode_success;
   10912    }
   10913 
   10914    /* 66 0F DD = PADDUSW */
   10915    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10916       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10917                                  "paddusw", Iop_QAdd16Ux8, False );
   10918       goto decode_success;
   10919    }
   10920 
   10921    /* 66 0F DB = PAND */
   10922    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10923       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10924       goto decode_success;
   10925    }
   10926 
   10927    /* 66 0F DF = PANDN */
   10928    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10929       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10930       goto decode_success;
   10931    }
   10932 
   10933    /* 66 0F E0 = PAVGB */
   10934    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10935       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10936                                  "pavgb", Iop_Avg8Ux16, False );
   10937       goto decode_success;
   10938    }
   10939 
   10940    /* 66 0F E3 = PAVGW */
   10941    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10942       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10943                                  "pavgw", Iop_Avg16Ux8, False );
   10944       goto decode_success;
   10945    }
   10946 
   10947    /* 66 0F 74 = PCMPEQB */
   10948    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10949       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10950                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10951       goto decode_success;
   10952    }
   10953 
   10954    /* 66 0F 76 = PCMPEQD */
   10955    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10956       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10957                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10958       goto decode_success;
   10959    }
   10960 
   10961    /* 66 0F 75 = PCMPEQW */
   10962    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10963       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10964                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10965       goto decode_success;
   10966    }
   10967 
   10968    /* 66 0F 64 = PCMPGTB */
   10969    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10970       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10971                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10972       goto decode_success;
   10973    }
   10974 
   10975    /* 66 0F 66 = PCMPGTD */
   10976    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10977       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10978                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10979       goto decode_success;
   10980    }
   10981 
   10982    /* 66 0F 65 = PCMPGTW */
   10983    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10984       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10985                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10986       goto decode_success;
   10987    }
   10988 
   10989    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10990       zero-extend of it in ireg(G). */
   10991    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10992       modrm = insn[2];
   10993       if (sz == 2 && epartIsReg(modrm)) {
   10994          t5 = newTemp(Ity_V128);
   10995          t4 = newTemp(Ity_I16);
   10996          assign(t5, getXMMReg(eregOfRM(modrm)));
   10997          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10998          switch (insn[3] & 7) {
   10999             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   11000             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   11001             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   11002             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   11003             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   11004             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   11005             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   11006             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   11007             default: vassert(0); /*NOTREACHED*/
   11008          }
   11009          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   11010          DIP("pextrw $%d,%s,%s\n",
   11011              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   11012                            nameIReg(4,gregOfRM(modrm)));
   11013          delta += 4;
   11014          goto decode_success;
   11015       }
   11016       /* else fall through */
   11017    }
   11018 
   11019    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   11020       put it into the specified lane of xmm(G). */
   11021    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   11022       Int lane;
   11023       t4 = newTemp(Ity_I16);
   11024       modrm = insn[2];
   11025 
   11026       if (epartIsReg(modrm)) {
   11027          assign(t4, getIReg(2, eregOfRM(modrm)));
   11028          delta += 3+1;
   11029          lane = insn[3+1-1];
   11030          DIP("pinsrw $%d,%s,%s\n", lane,
   11031                                    nameIReg(2,eregOfRM(modrm)),
   11032                                    nameXMMReg(gregOfRM(modrm)));
   11033       } else {
   11034          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11035          delta += 3+alen;
   11036          lane = insn[3+alen-1];
   11037          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   11038          DIP("pinsrw $%d,%s,%s\n", lane,
   11039                                    dis_buf,
   11040                                    nameXMMReg(gregOfRM(modrm)));
   11041       }
   11042 
   11043       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   11044       goto decode_success;
   11045    }
   11046 
   11047    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   11048       E(xmm or mem) to G(xmm) */
   11049    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   11050       IRTemp s1V  = newTemp(Ity_V128);
   11051       IRTemp s2V  = newTemp(Ity_V128);
   11052       IRTemp dV   = newTemp(Ity_V128);
   11053       IRTemp s1Hi = newTemp(Ity_I64);
   11054       IRTemp s1Lo = newTemp(Ity_I64);
   11055       IRTemp s2Hi = newTemp(Ity_I64);
   11056       IRTemp s2Lo = newTemp(Ity_I64);
   11057       IRTemp dHi  = newTemp(Ity_I64);
   11058       IRTemp dLo  = newTemp(Ity_I64);
   11059       modrm = insn[2];
   11060       if (epartIsReg(modrm)) {
   11061          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11062          delta += 2+1;
   11063          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11064                                 nameXMMReg(gregOfRM(modrm)));
   11065       } else {
   11066          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11067          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11068          delta += 2+alen;
   11069          DIP("pmaddwd %s,%s\n", dis_buf,
   11070                                 nameXMMReg(gregOfRM(modrm)));
   11071       }
   11072       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11073       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11074       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11075       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11076       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11077       assign( dHi, mkIRExprCCall(
   11078                       Ity_I64, 0/*regparms*/,
   11079                       "x86g_calculate_mmx_pmaddwd",
   11080                       &x86g_calculate_mmx_pmaddwd,
   11081                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11082                    ));
   11083       assign( dLo, mkIRExprCCall(
   11084                       Ity_I64, 0/*regparms*/,
   11085                       "x86g_calculate_mmx_pmaddwd",
   11086                       &x86g_calculate_mmx_pmaddwd,
   11087                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11088                    ));
   11089       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11090       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11091       goto decode_success;
   11092    }
   11093 
   11094    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   11095    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   11096       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11097                                  "pmaxsw", Iop_Max16Sx8, False );
   11098       goto decode_success;
   11099    }
   11100 
   11101    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   11102    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   11103       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11104                                  "pmaxub", Iop_Max8Ux16, False );
   11105       goto decode_success;
   11106    }
   11107 
   11108    /* 66 0F EA = PMINSW -- 16x8 signed min */
   11109    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   11110       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11111                                  "pminsw", Iop_Min16Sx8, False );
   11112       goto decode_success;
   11113    }
   11114 
   11115    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   11116    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   11117       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11118                                  "pminub", Iop_Min8Ux16, False );
   11119       goto decode_success;
   11120    }
   11121 
   11122    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
   11123       in xmm(E), turn them into a 16-bit value, and put zero-extend
   11124       of it in ireg(G). */
   11125    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   11126       modrm = insn[2];
   11127       if (epartIsReg(modrm)) {
   11128          t0 = newTemp(Ity_I64);
   11129          t1 = newTemp(Ity_I64);
   11130          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   11131          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   11132          t5 = newTemp(Ity_I32);
   11133          assign(t5,
   11134                 unop(Iop_16Uto32,
   11135                      binop(Iop_8HLto16,
   11136                            unop(Iop_GetMSBs8x8, mkexpr(t1)),
   11137                            unop(Iop_GetMSBs8x8, mkexpr(t0)))));
   11138          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   11139          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11140                                  nameIReg(4,gregOfRM(modrm)));
   11141          delta += 3;
   11142          goto decode_success;
   11143       }
   11144       /* else fall through */
   11145    }
   11146 
   11147    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   11148    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   11149       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11150                                  "pmulhuw", Iop_MulHi16Ux8, False );
   11151       goto decode_success;
   11152    }
   11153 
   11154    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   11155    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   11156       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11157                                  "pmulhw", Iop_MulHi16Sx8, False );
   11158       goto decode_success;
   11159    }
   11160 
   11161    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   11162    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   11163       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11164                                  "pmullw", Iop_Mul16x8, False );
   11165       goto decode_success;
   11166    }
   11167 
   11168    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11169    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11170       0 to form 64-bit result */
   11171    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11172       IRTemp sV = newTemp(Ity_I64);
   11173       IRTemp dV = newTemp(Ity_I64);
   11174       t1 = newTemp(Ity_I32);
   11175       t0 = newTemp(Ity_I32);
   11176       modrm = insn[2];
   11177 
   11178       do_MMX_preamble();
   11179       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11180 
   11181       if (epartIsReg(modrm)) {
   11182          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11183          delta += 2+1;
   11184          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11185                                 nameMMXReg(gregOfRM(modrm)));
   11186       } else {
   11187          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11188          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11189          delta += 2+alen;
   11190          DIP("pmuludq %s,%s\n", dis_buf,
   11191                                 nameMMXReg(gregOfRM(modrm)));
   11192       }
   11193 
   11194       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   11195       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   11196       putMMXReg( gregOfRM(modrm),
   11197                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   11198       goto decode_success;
   11199    }
   11200 
   11201    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11202       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   11203       half */
   11204    /* This is a really poor translation -- could be improved if
   11205       performance critical */
   11206    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11207       IRTemp sV, dV;
   11208       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11209       sV = newTemp(Ity_V128);
   11210       dV = newTemp(Ity_V128);
   11211       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11212       t1 = newTemp(Ity_I64);
   11213       t0 = newTemp(Ity_I64);
   11214       modrm = insn[2];
   11215       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11216 
   11217       if (epartIsReg(modrm)) {
   11218          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11219          delta += 2+1;
   11220          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11221                                 nameXMMReg(gregOfRM(modrm)));
   11222       } else {
   11223          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11224          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11225          delta += 2+alen;
   11226          DIP("pmuludq %s,%s\n", dis_buf,
   11227                                 nameXMMReg(gregOfRM(modrm)));
   11228       }
   11229 
   11230       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   11231       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11232 
   11233       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   11234       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   11235       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   11236       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   11237       goto decode_success;
   11238    }
   11239 
   11240    /* 66 0F EB = POR */
   11241    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   11242       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   11243       goto decode_success;
   11244    }
   11245 
   11246    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   11247       from E(xmm or mem) to G(xmm) */
   11248    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   11249       IRTemp s1V  = newTemp(Ity_V128);
   11250       IRTemp s2V  = newTemp(Ity_V128);
   11251       IRTemp dV   = newTemp(Ity_V128);
   11252       IRTemp s1Hi = newTemp(Ity_I64);
   11253       IRTemp s1Lo = newTemp(Ity_I64);
   11254       IRTemp s2Hi = newTemp(Ity_I64);
   11255       IRTemp s2Lo = newTemp(Ity_I64);
   11256       IRTemp dHi  = newTemp(Ity_I64);
   11257       IRTemp dLo  = newTemp(Ity_I64);
   11258       modrm = insn[2];
   11259       if (epartIsReg(modrm)) {
   11260          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11261          delta += 2+1;
   11262          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11263                                nameXMMReg(gregOfRM(modrm)));
   11264       } else {
   11265          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11266          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11267          delta += 2+alen;
   11268          DIP("psadbw %s,%s\n", dis_buf,
   11269                                nameXMMReg(gregOfRM(modrm)));
   11270       }
   11271       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11272       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11273       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11274       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11275       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11276       assign( dHi, mkIRExprCCall(
   11277                       Ity_I64, 0/*regparms*/,
   11278                       "x86g_calculate_mmx_psadbw",
   11279                       &x86g_calculate_mmx_psadbw,
   11280                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11281                    ));
   11282       assign( dLo, mkIRExprCCall(
   11283                       Ity_I64, 0/*regparms*/,
   11284                       "x86g_calculate_mmx_psadbw",
   11285                       &x86g_calculate_mmx_psadbw,
   11286                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11287                    ));
   11288       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11289       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11290       goto decode_success;
   11291    }
   11292 
   11293    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   11294    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   11295       Int order;
   11296       IRTemp sV, dV, s3, s2, s1, s0;
   11297       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11298       sV = newTemp(Ity_V128);
   11299       dV = newTemp(Ity_V128);
   11300       modrm = insn[2];
   11301       if (epartIsReg(modrm)) {
   11302          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11303          order = (Int)insn[3];
   11304          delta += 2+2;
   11305          DIP("pshufd $%d,%s,%s\n", order,
   11306                                    nameXMMReg(eregOfRM(modrm)),
   11307                                    nameXMMReg(gregOfRM(modrm)));
   11308       } else {
   11309          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11310          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11311 	 order = (Int)insn[2+alen];
   11312          delta += 3+alen;
   11313          DIP("pshufd $%d,%s,%s\n", order,
   11314                                    dis_buf,
   11315                                    nameXMMReg(gregOfRM(modrm)));
   11316       }
   11317       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11318 
   11319 #     define SEL(n) \
   11320                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11321       assign(dV,
   11322 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11323                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11324       );
   11325       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11326 #     undef SEL
   11327       goto decode_success;
   11328    }
   11329 
   11330    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11331       mem) to G(xmm), and copy lower half */
   11332    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11333       Int order;
   11334       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11335       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11336       sV   = newTemp(Ity_V128);
   11337       dV   = newTemp(Ity_V128);
   11338       sVhi = newTemp(Ity_I64);
   11339       dVhi = newTemp(Ity_I64);
   11340       modrm = insn[3];
   11341       if (epartIsReg(modrm)) {
   11342          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11343          order = (Int)insn[4];
   11344          delta += 4+1;
   11345          DIP("pshufhw $%d,%s,%s\n", order,
   11346                                     nameXMMReg(eregOfRM(modrm)),
   11347                                     nameXMMReg(gregOfRM(modrm)));
   11348       } else {
   11349          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11350          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11351 	 order = (Int)insn[3+alen];
   11352          delta += 4+alen;
   11353          DIP("pshufhw $%d,%s,%s\n", order,
   11354                                     dis_buf,
   11355                                     nameXMMReg(gregOfRM(modrm)));
   11356       }
   11357       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11358       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11359 
   11360 #     define SEL(n) \
   11361                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11362       assign(dVhi,
   11363 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11364                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11365       );
   11366       assign(dV, binop( Iop_64HLtoV128,
   11367                         mkexpr(dVhi),
   11368                         unop(Iop_V128to64, mkexpr(sV))) );
   11369       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11370 #     undef SEL
   11371       goto decode_success;
   11372    }
   11373 
   11374    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11375       mem) to G(xmm), and copy upper half */
   11376    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11377       Int order;
   11378       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11379       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11380       sV   = newTemp(Ity_V128);
   11381       dV   = newTemp(Ity_V128);
   11382       sVlo = newTemp(Ity_I64);
   11383       dVlo = newTemp(Ity_I64);
   11384       modrm = insn[3];
   11385       if (epartIsReg(modrm)) {
   11386          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11387          order = (Int)insn[4];
   11388          delta += 4+1;
   11389          DIP("pshuflw $%d,%s,%s\n", order,
   11390                                     nameXMMReg(eregOfRM(modrm)),
   11391                                     nameXMMReg(gregOfRM(modrm)));
   11392       } else {
   11393          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11394          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11395 	 order = (Int)insn[3+alen];
   11396          delta += 4+alen;
   11397          DIP("pshuflw $%d,%s,%s\n", order,
   11398                                     dis_buf,
   11399                                     nameXMMReg(gregOfRM(modrm)));
   11400       }
   11401       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11402       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11403 
   11404 #     define SEL(n) \
   11405                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11406       assign(dVlo,
   11407 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11408                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11409       );
   11410       assign(dV, binop( Iop_64HLtoV128,
   11411                         unop(Iop_V128HIto64, mkexpr(sV)),
   11412                         mkexpr(dVlo) ) );
   11413       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11414 #     undef SEL
   11415       goto decode_success;
   11416    }
   11417 
   11418    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11419    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11420        && epartIsReg(insn[2])
   11421        && gregOfRM(insn[2]) == 6) {
   11422       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11423       goto decode_success;
   11424    }
   11425 
   11426    /* 66 0F F2 = PSLLD by E */
   11427    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11428       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11429       goto decode_success;
   11430    }
   11431 
   11432    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11433    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11434        && epartIsReg(insn[2])
   11435        && gregOfRM(insn[2]) == 7) {
   11436       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11437       Int    imm = (Int)insn[3];
   11438       Int    reg = eregOfRM(insn[2]);
   11439       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11440       vassert(imm >= 0 && imm <= 255);
   11441       delta += 4;
   11442 
   11443       sV    = newTemp(Ity_V128);
   11444       dV    = newTemp(Ity_V128);
   11445       hi64  = newTemp(Ity_I64);
   11446       lo64  = newTemp(Ity_I64);
   11447       hi64r = newTemp(Ity_I64);
   11448       lo64r = newTemp(Ity_I64);
   11449 
   11450       if (imm >= 16) {
   11451          putXMMReg(reg, mkV128(0x0000));
   11452          goto decode_success;
   11453       }
   11454 
   11455       assign( sV, getXMMReg(reg) );
   11456       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11457       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11458 
   11459       if (imm == 0) {
   11460          assign( lo64r, mkexpr(lo64) );
   11461          assign( hi64r, mkexpr(hi64) );
   11462       }
   11463       else
   11464       if (imm == 8) {
   11465          assign( lo64r, mkU64(0) );
   11466          assign( hi64r, mkexpr(lo64) );
   11467       }
   11468       else
   11469       if (imm > 8) {
   11470          assign( lo64r, mkU64(0) );
   11471          assign( hi64r, binop( Iop_Shl64,
   11472                                mkexpr(lo64),
   11473                                mkU8( 8*(imm-8) ) ));
   11474       } else {
   11475          assign( lo64r, binop( Iop_Shl64,
   11476                                mkexpr(lo64),
   11477                                mkU8(8 * imm) ));
   11478          assign( hi64r,
   11479                  binop( Iop_Or64,
   11480                         binop(Iop_Shl64, mkexpr(hi64),
   11481                                          mkU8(8 * imm)),
   11482                         binop(Iop_Shr64, mkexpr(lo64),
   11483                                          mkU8(8 * (8 - imm)) )
   11484                       )
   11485                );
   11486       }
   11487       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11488       putXMMReg(reg, mkexpr(dV));
   11489       goto decode_success;
   11490    }
   11491 
   11492    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11493    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11494        && epartIsReg(insn[2])
   11495        && gregOfRM(insn[2]) == 6) {
   11496       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11497       goto decode_success;
   11498    }
   11499 
   11500    /* 66 0F F3 = PSLLQ by E */
   11501    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11502       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11503       goto decode_success;
   11504    }
   11505 
   11506    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11507    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11508        && epartIsReg(insn[2])
   11509        && gregOfRM(insn[2]) == 6) {
   11510       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11511       goto decode_success;
   11512    }
   11513 
   11514    /* 66 0F F1 = PSLLW by E */
   11515    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11516       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11517       goto decode_success;
   11518    }
   11519 
   11520    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11521    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11522        && epartIsReg(insn[2])
   11523        && gregOfRM(insn[2]) == 4) {
   11524       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11525       goto decode_success;
   11526    }
   11527 
   11528    /* 66 0F E2 = PSRAD by E */
   11529    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11530       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11531       goto decode_success;
   11532    }
   11533 
   11534    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11535    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11536        && epartIsReg(insn[2])
   11537        && gregOfRM(insn[2]) == 4) {
   11538       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11539       goto decode_success;
   11540    }
   11541 
   11542    /* 66 0F E1 = PSRAW by E */
   11543    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11544       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11545       goto decode_success;
   11546    }
   11547 
   11548    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11549    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11550        && epartIsReg(insn[2])
   11551        && gregOfRM(insn[2]) == 2) {
   11552       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11553       goto decode_success;
   11554    }
   11555 
   11556    /* 66 0F D2 = PSRLD by E */
   11557    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11558       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11559       goto decode_success;
   11560    }
   11561 
   11562    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11563    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11564        && epartIsReg(insn[2])
   11565        && gregOfRM(insn[2]) == 3) {
   11566       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11567       Int    imm = (Int)insn[3];
   11568       Int    reg = eregOfRM(insn[2]);
   11569       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11570       vassert(imm >= 0 && imm <= 255);
   11571       delta += 4;
   11572 
   11573       sV    = newTemp(Ity_V128);
   11574       dV    = newTemp(Ity_V128);
   11575       hi64  = newTemp(Ity_I64);
   11576       lo64  = newTemp(Ity_I64);
   11577       hi64r = newTemp(Ity_I64);
   11578       lo64r = newTemp(Ity_I64);
   11579 
   11580       if (imm >= 16) {
   11581          putXMMReg(reg, mkV128(0x0000));
   11582          goto decode_success;
   11583       }
   11584 
   11585       assign( sV, getXMMReg(reg) );
   11586       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11587       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11588 
   11589       if (imm == 0) {
   11590          assign( lo64r, mkexpr(lo64) );
   11591          assign( hi64r, mkexpr(hi64) );
   11592       }
   11593       else
   11594       if (imm == 8) {
   11595          assign( hi64r, mkU64(0) );
   11596          assign( lo64r, mkexpr(hi64) );
   11597       }
   11598       else
   11599       if (imm > 8) {
   11600          assign( hi64r, mkU64(0) );
   11601          assign( lo64r, binop( Iop_Shr64,
   11602                                mkexpr(hi64),
   11603                                mkU8( 8*(imm-8) ) ));
   11604       } else {
   11605          assign( hi64r, binop( Iop_Shr64,
   11606                                mkexpr(hi64),
   11607                                mkU8(8 * imm) ));
   11608          assign( lo64r,
   11609                  binop( Iop_Or64,
   11610                         binop(Iop_Shr64, mkexpr(lo64),
   11611                                          mkU8(8 * imm)),
   11612                         binop(Iop_Shl64, mkexpr(hi64),
   11613                                          mkU8(8 * (8 - imm)) )
   11614                       )
   11615                );
   11616       }
   11617 
   11618       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11619       putXMMReg(reg, mkexpr(dV));
   11620       goto decode_success;
   11621    }
   11622 
   11623    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11624    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11625        && epartIsReg(insn[2])
   11626        && gregOfRM(insn[2]) == 2) {
   11627       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11628       goto decode_success;
   11629    }
   11630 
   11631    /* 66 0F D3 = PSRLQ by E */
   11632    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11633       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11634       goto decode_success;
   11635    }
   11636 
   11637    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11638    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11639        && epartIsReg(insn[2])
   11640        && gregOfRM(insn[2]) == 2) {
   11641       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11642       goto decode_success;
   11643    }
   11644 
   11645    /* 66 0F D1 = PSRLW by E */
   11646    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11647       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11648       goto decode_success;
   11649    }
   11650 
   11651    /* 66 0F F8 = PSUBB */
   11652    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11653       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11654                                  "psubb", Iop_Sub8x16, False );
   11655       goto decode_success;
   11656    }
   11657 
   11658    /* 66 0F FA = PSUBD */
   11659    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11660       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11661                                  "psubd", Iop_Sub32x4, False );
   11662       goto decode_success;
   11663    }
   11664 
   11665    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11666    /* 0F FB = PSUBQ -- sub 64x1 */
   11667    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11668       do_MMX_preamble();
   11669       delta = dis_MMXop_regmem_to_reg (
   11670                 sorb, delta+2, insn[1], "psubq", False );
   11671       goto decode_success;
   11672    }
   11673 
   11674    /* 66 0F FB = PSUBQ */
   11675    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11676       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11677                                  "psubq", Iop_Sub64x2, False );
   11678       goto decode_success;
   11679    }
   11680 
   11681    /* 66 0F F9 = PSUBW */
   11682    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11683       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11684                                  "psubw", Iop_Sub16x8, False );
   11685       goto decode_success;
   11686    }
   11687 
   11688    /* 66 0F E8 = PSUBSB */
   11689    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11690       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11691                                  "psubsb", Iop_QSub8Sx16, False );
   11692       goto decode_success;
   11693    }
   11694 
   11695    /* 66 0F E9 = PSUBSW */
   11696    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11697       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11698                                  "psubsw", Iop_QSub16Sx8, False );
   11699       goto decode_success;
   11700    }
   11701 
   11702    /* 66 0F D8 = PSUBUSB */
   11703    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11704       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11705                                  "psubusb", Iop_QSub8Ux16, False );
   11706       goto decode_success;
   11707    }
   11708 
   11709    /* 66 0F D9 = PSUBUSW */
   11710    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11711       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11712                                  "psubusw", Iop_QSub16Ux8, False );
   11713       goto decode_success;
   11714    }
   11715 
   11716    /* 66 0F 68 = PUNPCKHBW */
   11717    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11718       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11719                                  "punpckhbw",
   11720                                  Iop_InterleaveHI8x16, True );
   11721       goto decode_success;
   11722    }
   11723 
   11724    /* 66 0F 6A = PUNPCKHDQ */
   11725    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11726       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11727                                  "punpckhdq",
   11728                                  Iop_InterleaveHI32x4, True );
   11729       goto decode_success;
   11730    }
   11731 
   11732    /* 66 0F 6D = PUNPCKHQDQ */
   11733    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11734       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11735                                  "punpckhqdq",
   11736                                  Iop_InterleaveHI64x2, True );
   11737       goto decode_success;
   11738    }
   11739 
   11740    /* 66 0F 69 = PUNPCKHWD */
   11741    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11742       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11743                                  "punpckhwd",
   11744                                  Iop_InterleaveHI16x8, True );
   11745       goto decode_success;
   11746    }
   11747 
   11748    /* 66 0F 60 = PUNPCKLBW */
   11749    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11750       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11751                                  "punpcklbw",
   11752                                  Iop_InterleaveLO8x16, True );
   11753       goto decode_success;
   11754    }
   11755 
   11756    /* 66 0F 62 = PUNPCKLDQ */
   11757    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11758       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11759                                  "punpckldq",
   11760                                  Iop_InterleaveLO32x4, True );
   11761       goto decode_success;
   11762    }
   11763 
   11764    /* 66 0F 6C = PUNPCKLQDQ */
   11765    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11766       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11767                                  "punpcklqdq",
   11768                                  Iop_InterleaveLO64x2, True );
   11769       goto decode_success;
   11770    }
   11771 
   11772    /* 66 0F 61 = PUNPCKLWD */
   11773    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11774       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11775                                  "punpcklwd",
   11776                                  Iop_InterleaveLO16x8, True );
   11777       goto decode_success;
   11778    }
   11779 
   11780    /* 66 0F EF = PXOR */
   11781    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11782       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11783       goto decode_success;
   11784    }
   11785 
   11786 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11787 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11788 //--        && (!epartIsReg(insn[2]))
   11789 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11790 //--       Bool store = gregOfRM(insn[2]) == 0;
   11791 //--       vg_assert(sz == 4);
   11792 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11793 //--       t1   = LOW24(pair);
   11794 //--       eip += 2+HI8(pair);
   11795 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11796 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11797 //--                   Lit16, (UShort)insn[2],
   11798 //--                   TempReg, t1 );
   11799 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11800 //--       goto decode_success;
   11801 //--    }
   11802 
   11803    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11804    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11805        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11806 
   11807       /* This is something of a hack.  We need to know the size of the
   11808          cache line containing addr.  Since we don't (easily), assume
   11809          256 on the basis that no real cache would have a line that
   11810          big.  It's safe to invalidate more stuff than we need, just
   11811          inefficient. */
   11812       UInt lineszB = 256;
   11813 
   11814       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11815       delta += 2+alen;
   11816 
   11817       /* Round addr down to the start of the containing block. */
   11818       stmt( IRStmt_Put(
   11819                OFFB_CMSTART,
   11820                binop( Iop_And32,
   11821                       mkexpr(addr),
   11822                       mkU32( ~(lineszB-1) ))) );
   11823 
   11824       stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );
   11825 
   11826       jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));
   11827 
   11828       DIP("clflush %s\n", dis_buf);
   11829       goto decode_success;
   11830    }
   11831 
   11832    /* ---------------------------------------------------- */
   11833    /* --- end of the SSE2 decoder.                     --- */
   11834    /* ---------------------------------------------------- */
   11835 
   11836    /* ---------------------------------------------------- */
   11837    /* --- start of the SSE3 decoder.                   --- */
   11838    /* ---------------------------------------------------- */
   11839 
   11840    /* Skip parts of the decoder which don't apply given the stated
   11841       guest subarchitecture. */
   11842    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3))
   11843       goto after_sse_decoders; /* no SSE3 capabilities */
   11844 
   11845    insn = &guest_code[delta];
   11846 
   11847    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11848       duplicating some lanes (2:2:0:0). */
   11849    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11850       duplicating some lanes (3:3:1:1). */
   11851    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11852        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11853       IRTemp s3, s2, s1, s0;
   11854       IRTemp sV  = newTemp(Ity_V128);
   11855       Bool   isH = insn[2] == 0x16;
   11856       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11857 
   11858       modrm = insn[3];
   11859       if (epartIsReg(modrm)) {
   11860          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11861          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11862                                   nameXMMReg(eregOfRM(modrm)),
   11863                                   nameXMMReg(gregOfRM(modrm)));
   11864          delta += 3+1;
   11865       } else {
   11866          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11867          gen_SEGV_if_not_16_aligned( addr );
   11868          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11869          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11870 	     dis_buf,
   11871              nameXMMReg(gregOfRM(modrm)));
   11872          delta += 3+alen;
   11873       }
   11874 
   11875       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11876       putXMMReg( gregOfRM(modrm),
   11877                  isH ? mk128from32s( s3, s3, s1, s1 )
   11878                      : mk128from32s( s2, s2, s0, s0 ) );
   11879       goto decode_success;
   11880    }
   11881 
   11882    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11883       duplicating some lanes (0:1:0:1). */
   11884    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11885       IRTemp sV = newTemp(Ity_V128);
   11886       IRTemp d0 = newTemp(Ity_I64);
   11887 
   11888       modrm = insn[3];
   11889       if (epartIsReg(modrm)) {
   11890          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11891          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11892                                 nameXMMReg(gregOfRM(modrm)));
   11893          delta += 3+1;
   11894          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11895       } else {
   11896          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11897          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11898          DIP("movddup %s,%s\n", dis_buf,
   11899                                 nameXMMReg(gregOfRM(modrm)));
   11900          delta += 3+alen;
   11901       }
   11902 
   11903       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11904       goto decode_success;
   11905    }
   11906 
   11907    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11908    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11909       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11910       IRTemp eV   = newTemp(Ity_V128);
   11911       IRTemp gV   = newTemp(Ity_V128);
   11912       IRTemp addV = newTemp(Ity_V128);
   11913       IRTemp subV = newTemp(Ity_V128);
   11914       IRTemp rm     = newTemp(Ity_I32);
   11915       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11916 
   11917       modrm = insn[3];
   11918       if (epartIsReg(modrm)) {
   11919          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11920          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11921                                  nameXMMReg(gregOfRM(modrm)));
   11922          delta += 3+1;
   11923       } else {
   11924          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11925          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11926          DIP("addsubps %s,%s\n", dis_buf,
   11927                                  nameXMMReg(gregOfRM(modrm)));
   11928          delta += 3+alen;
   11929       }
   11930 
   11931       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11932 
   11933       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11934       assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11935       assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11936 
   11937       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11938       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11939 
   11940       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11941       goto decode_success;
   11942    }
   11943 
   11944    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11945    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11946       IRTemp eV   = newTemp(Ity_V128);
   11947       IRTemp gV   = newTemp(Ity_V128);
   11948       IRTemp addV = newTemp(Ity_V128);
   11949       IRTemp subV = newTemp(Ity_V128);
   11950       IRTemp a1     = newTemp(Ity_I64);
   11951       IRTemp s0     = newTemp(Ity_I64);
   11952       IRTemp rm     = newTemp(Ity_I32);
   11953 
   11954       modrm = insn[2];
   11955       if (epartIsReg(modrm)) {
   11956          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11957          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11958                                  nameXMMReg(gregOfRM(modrm)));
   11959          delta += 2+1;
   11960       } else {
   11961          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11962          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11963          DIP("addsubpd %s,%s\n", dis_buf,
   11964                                  nameXMMReg(gregOfRM(modrm)));
   11965          delta += 2+alen;
   11966       }
   11967 
   11968       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11969 
   11970       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11971       assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11972       assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11973 
   11974       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11975       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11976 
   11977       putXMMReg( gregOfRM(modrm),
   11978                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11979       goto decode_success;
   11980    }
   11981 
   11982    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11983    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11984    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11985        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11986       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11987       IRTemp eV     = newTemp(Ity_V128);
   11988       IRTemp gV     = newTemp(Ity_V128);
   11989       IRTemp leftV  = newTemp(Ity_V128);
   11990       IRTemp rightV = newTemp(Ity_V128);
   11991       IRTemp rm     = newTemp(Ity_I32);
   11992       Bool   isAdd  = insn[2] == 0x7C;
   11993       const HChar* str = isAdd ? "add" : "sub";
   11994       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11995 
   11996       modrm = insn[3];
   11997       if (epartIsReg(modrm)) {
   11998          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11999          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12000                                    nameXMMReg(gregOfRM(modrm)));
   12001          delta += 3+1;
   12002       } else {
   12003          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12004          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   12005          DIP("h%sps %s,%s\n", str, dis_buf,
   12006                                    nameXMMReg(gregOfRM(modrm)));
   12007          delta += 3+alen;
   12008       }
   12009 
   12010       assign( gV, getXMMReg(gregOfRM(modrm)) );
   12011 
   12012       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   12013       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   12014 
   12015       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   12016       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   12017 
   12018       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   12019       putXMMReg( gregOfRM(modrm),
   12020                  triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   12021                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   12022       goto decode_success;
   12023    }
   12024 
   12025    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   12026    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   12027    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   12028       IRTemp e1     = newTemp(Ity_I64);
   12029       IRTemp e0     = newTemp(Ity_I64);
   12030       IRTemp g1     = newTemp(Ity_I64);
   12031       IRTemp g0     = newTemp(Ity_I64);
   12032       IRTemp eV     = newTemp(Ity_V128);
   12033       IRTemp gV     = newTemp(Ity_V128);
   12034       IRTemp leftV  = newTemp(Ity_V128);
   12035       IRTemp rightV = newTemp(Ity_V128);
   12036       IRTemp rm     = newTemp(Ity_I32);
   12037       Bool   isAdd  = insn[1] == 0x7C;
   12038       const HChar* str = isAdd ? "add" : "sub";
   12039 
   12040       modrm = insn[2];
   12041       if (epartIsReg(modrm)) {
   12042          assign( eV, getXMMReg( eregOfRM(modrm)) );
   12043          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12044                                    nameXMMReg(gregOfRM(modrm)));
   12045          delta += 2+1;
   12046       } else {
   12047          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   12048          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   12049          DIP("h%spd %s,%s\n", str, dis_buf,
   12050                               nameXMMReg(gregOfRM(modrm)));
   12051          delta += 2+alen;
   12052       }
   12053 
   12054       assign( gV, getXMMReg(gregOfRM(modrm)) );
   12055 
   12056       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   12057       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   12058       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   12059       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   12060 
   12061       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   12062       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   12063 
   12064       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   12065       putXMMReg( gregOfRM(modrm),
   12066                  triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   12067                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   12068       goto decode_success;
   12069    }
   12070 
   12071    /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   12072    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   12073       modrm = getIByte(delta+3);
   12074       if (epartIsReg(modrm)) {
   12075          goto decode_failure;
   12076       } else {
   12077          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12078          putXMMReg( gregOfRM(modrm),
   12079                     loadLE(Ity_V128, mkexpr(addr)) );
   12080          DIP("lddqu %s,%s\n", dis_buf,
   12081                               nameXMMReg(gregOfRM(modrm)));
   12082          delta += 3+alen;
   12083       }
   12084       goto decode_success;
   12085    }
   12086 
   12087    /* ---------------------------------------------------- */
   12088    /* --- end of the SSE3 decoder.                     --- */
   12089    /* ---------------------------------------------------- */
   12090 
   12091    /* ---------------------------------------------------- */
   12092    /* --- start of the SSSE3 decoder.                  --- */
   12093    /* ---------------------------------------------------- */
   12094 
   12095    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12096       Unsigned Bytes (MMX) */
   12097    if (sz == 4
   12098        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12099       IRTemp sV        = newTemp(Ity_I64);
   12100       IRTemp dV        = newTemp(Ity_I64);
   12101       IRTemp sVoddsSX  = newTemp(Ity_I64);
   12102       IRTemp sVevensSX = newTemp(Ity_I64);
   12103       IRTemp dVoddsZX  = newTemp(Ity_I64);
   12104       IRTemp dVevensZX = newTemp(Ity_I64);
   12105 
   12106       modrm = insn[3];
   12107       do_MMX_preamble();
   12108       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12109 
   12110       if (epartIsReg(modrm)) {
   12111          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12112          delta += 3+1;
   12113          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12114                                   nameMMXReg(gregOfRM(modrm)));
   12115       } else {
   12116          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12117          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12118          delta += 3+alen;
   12119          DIP("pmaddubsw %s,%s\n", dis_buf,
   12120                                   nameMMXReg(gregOfRM(modrm)));
   12121       }
   12122 
   12123       /* compute dV unsigned x sV signed */
   12124       assign( sVoddsSX,
   12125               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   12126       assign( sVevensSX,
   12127               binop(Iop_SarN16x4,
   12128                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   12129                     mkU8(8)) );
   12130       assign( dVoddsZX,
   12131               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   12132       assign( dVevensZX,
   12133               binop(Iop_ShrN16x4,
   12134                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   12135                     mkU8(8)) );
   12136 
   12137       putMMXReg(
   12138          gregOfRM(modrm),
   12139          binop(Iop_QAdd16Sx4,
   12140                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12141                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12142          )
   12143       );
   12144       goto decode_success;
   12145    }
   12146 
   12147    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12148       Unsigned Bytes (XMM) */
   12149    if (sz == 2
   12150        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12151       IRTemp sV        = newTemp(Ity_V128);
   12152       IRTemp dV        = newTemp(Ity_V128);
   12153       IRTemp sVoddsSX  = newTemp(Ity_V128);
   12154       IRTemp sVevensSX = newTemp(Ity_V128);
   12155       IRTemp dVoddsZX  = newTemp(Ity_V128);
   12156       IRTemp dVevensZX = newTemp(Ity_V128);
   12157 
   12158       modrm = insn[3];
   12159       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12160 
   12161       if (epartIsReg(modrm)) {
   12162          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12163          delta += 3+1;
   12164          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12165                                   nameXMMReg(gregOfRM(modrm)));
   12166       } else {
   12167          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12168          gen_SEGV_if_not_16_aligned( addr );
   12169          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12170          delta += 3+alen;
   12171          DIP("pmaddubsw %s,%s\n", dis_buf,
   12172                                   nameXMMReg(gregOfRM(modrm)));
   12173       }
   12174 
   12175       /* compute dV unsigned x sV signed */
   12176       assign( sVoddsSX,
   12177               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   12178       assign( sVevensSX,
   12179               binop(Iop_SarN16x8,
   12180                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   12181                     mkU8(8)) );
   12182       assign( dVoddsZX,
   12183               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   12184       assign( dVevensZX,
   12185               binop(Iop_ShrN16x8,
   12186                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   12187                     mkU8(8)) );
   12188 
   12189       putXMMReg(
   12190          gregOfRM(modrm),
   12191          binop(Iop_QAdd16Sx8,
   12192                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12193                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12194          )
   12195       );
   12196       goto decode_success;
   12197    }
   12198 
   12199    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   12200    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   12201       mmx) and G to G (mmx). */
   12202    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   12203       mmx) and G to G (mmx). */
   12204    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   12205       to G (mmx). */
   12206    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   12207       to G (mmx). */
   12208    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   12209       to G (mmx). */
   12210    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   12211       to G (mmx). */
   12212 
   12213    if (sz == 4
   12214        && insn[0] == 0x0F && insn[1] == 0x38
   12215        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12216            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12217       const HChar* str = "???";
   12218       IROp   opV64  = Iop_INVALID;
   12219       IROp   opCatO = Iop_CatOddLanes16x4;
   12220       IROp   opCatE = Iop_CatEvenLanes16x4;
   12221       IRTemp sV     = newTemp(Ity_I64);
   12222       IRTemp dV     = newTemp(Ity_I64);
   12223 
   12224       modrm = insn[3];
   12225 
   12226       switch (insn[2]) {
   12227          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12228          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12229          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12230          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12231          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12232          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12233          default: vassert(0);
   12234       }
   12235       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12236          opCatO = Iop_InterleaveHI32x2;
   12237          opCatE = Iop_InterleaveLO32x2;
   12238       }
   12239 
   12240       do_MMX_preamble();
   12241       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12242 
   12243       if (epartIsReg(modrm)) {
   12244          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12245          delta += 3+1;
   12246          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12247                                   nameMMXReg(gregOfRM(modrm)));
   12248       } else {
   12249          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12250          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12251          delta += 3+alen;
   12252          DIP("ph%s %s,%s\n", str, dis_buf,
   12253                                   nameMMXReg(gregOfRM(modrm)));
   12254       }
   12255 
   12256       putMMXReg(
   12257          gregOfRM(modrm),
   12258          binop(opV64,
   12259                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   12260                binop(opCatO,mkexpr(sV),mkexpr(dV))
   12261          )
   12262       );
   12263       goto decode_success;
   12264    }
   12265 
   12266    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   12267       xmm) and G to G (xmm). */
   12268    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   12269       xmm) and G to G (xmm). */
   12270    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   12271       G to G (xmm). */
   12272    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   12273       G to G (xmm). */
   12274    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   12275       G to G (xmm). */
   12276    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   12277       G to G (xmm). */
   12278 
   12279    if (sz == 2
   12280        && insn[0] == 0x0F && insn[1] == 0x38
   12281        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12282            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12283       const HChar* str = "???";
   12284       IROp   opV64  = Iop_INVALID;
   12285       IROp   opCatO = Iop_CatOddLanes16x4;
   12286       IROp   opCatE = Iop_CatEvenLanes16x4;
   12287       IRTemp sV     = newTemp(Ity_V128);
   12288       IRTemp dV     = newTemp(Ity_V128);
   12289       IRTemp sHi    = newTemp(Ity_I64);
   12290       IRTemp sLo    = newTemp(Ity_I64);
   12291       IRTemp dHi    = newTemp(Ity_I64);
   12292       IRTemp dLo    = newTemp(Ity_I64);
   12293 
   12294       modrm = insn[3];
   12295 
   12296       switch (insn[2]) {
   12297          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12298          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12299          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12300          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12301          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12302          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12303          default: vassert(0);
   12304       }
   12305       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12306          opCatO = Iop_InterleaveHI32x2;
   12307          opCatE = Iop_InterleaveLO32x2;
   12308       }
   12309 
   12310       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12311 
   12312       if (epartIsReg(modrm)) {
   12313          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12314          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12315                                   nameXMMReg(gregOfRM(modrm)));
   12316          delta += 3+1;
   12317       } else {
   12318          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12319          gen_SEGV_if_not_16_aligned( addr );
   12320          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12321          DIP("ph%s %s,%s\n", str, dis_buf,
   12322                              nameXMMReg(gregOfRM(modrm)));
   12323          delta += 3+alen;
   12324       }
   12325 
   12326       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12327       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12328       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12329       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12330 
   12331       /* This isn't a particularly efficient way to compute the
   12332          result, but at least it avoids a proliferation of IROps,
   12333          hence avoids complicating all the backends. */
   12334       putXMMReg(
   12335          gregOfRM(modrm),
   12336          binop(Iop_64HLtoV128,
   12337                binop(opV64,
   12338                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12339                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12340                ),
   12341                binop(opV64,
   12342                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12343                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12344                )
   12345          )
   12346       );
   12347       goto decode_success;
   12348    }
   12349 
   12350    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12351       (MMX) */
   12352    if (sz == 4
   12353        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12354       IRTemp sV = newTemp(Ity_I64);
   12355       IRTemp dV = newTemp(Ity_I64);
   12356 
   12357       modrm = insn[3];
   12358       do_MMX_preamble();
   12359       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12360 
   12361       if (epartIsReg(modrm)) {
   12362          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12363          delta += 3+1;
   12364          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12365                                  nameMMXReg(gregOfRM(modrm)));
   12366       } else {
   12367          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12368          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12369          delta += 3+alen;
   12370          DIP("pmulhrsw %s,%s\n", dis_buf,
   12371                                  nameMMXReg(gregOfRM(modrm)));
   12372       }
   12373 
   12374       putMMXReg(
   12375          gregOfRM(modrm),
   12376          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12377       );
   12378       goto decode_success;
   12379    }
   12380 
   12381    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12382       Scale (XMM) */
   12383    if (sz == 2
   12384        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12385       IRTemp sV  = newTemp(Ity_V128);
   12386       IRTemp dV  = newTemp(Ity_V128);
   12387       IRTemp sHi = newTemp(Ity_I64);
   12388       IRTemp sLo = newTemp(Ity_I64);
   12389       IRTemp dHi = newTemp(Ity_I64);
   12390       IRTemp dLo = newTemp(Ity_I64);
   12391 
   12392       modrm = insn[3];
   12393       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12394 
   12395       if (epartIsReg(modrm)) {
   12396          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12397          delta += 3+1;
   12398          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12399                                  nameXMMReg(gregOfRM(modrm)));
   12400       } else {
   12401          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12402          gen_SEGV_if_not_16_aligned( addr );
   12403          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12404          delta += 3+alen;
   12405          DIP("pmulhrsw %s,%s\n", dis_buf,
   12406                                  nameXMMReg(gregOfRM(modrm)));
   12407       }
   12408 
   12409       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12410       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12411       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12412       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12413 
   12414       putXMMReg(
   12415          gregOfRM(modrm),
   12416          binop(Iop_64HLtoV128,
   12417                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12418                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12419          )
   12420       );
   12421       goto decode_success;
   12422    }
   12423 
   12424    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12425    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12426    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12427    if (sz == 4
   12428        && insn[0] == 0x0F && insn[1] == 0x38
   12429        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12430       IRTemp sV      = newTemp(Ity_I64);
   12431       IRTemp dV      = newTemp(Ity_I64);
   12432       const HChar* str = "???";
   12433       Int    laneszB = 0;
   12434 
   12435       switch (insn[2]) {
   12436          case 0x08: laneszB = 1; str = "b"; break;
   12437          case 0x09: laneszB = 2; str = "w"; break;
   12438          case 0x0A: laneszB = 4; str = "d"; break;
   12439          default: vassert(0);
   12440       }
   12441 
   12442       modrm = insn[3];
   12443       do_MMX_preamble();
   12444       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12445 
   12446       if (epartIsReg(modrm)) {
   12447          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12448          delta += 3+1;
   12449          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12450                                      nameMMXReg(gregOfRM(modrm)));
   12451       } else {
   12452          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12453          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12454          delta += 3+alen;
   12455          DIP("psign%s %s,%s\n", str, dis_buf,
   12456                                      nameMMXReg(gregOfRM(modrm)));
   12457       }
   12458 
   12459       putMMXReg(
   12460          gregOfRM(modrm),
   12461          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12462       );
   12463       goto decode_success;
   12464    }
   12465 
   12466    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12467    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12468    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12469    if (sz == 2
   12470        && insn[0] == 0x0F && insn[1] == 0x38
   12471        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12472       IRTemp sV      = newTemp(Ity_V128);
   12473       IRTemp dV      = newTemp(Ity_V128);
   12474       IRTemp sHi     = newTemp(Ity_I64);
   12475       IRTemp sLo     = newTemp(Ity_I64);
   12476       IRTemp dHi     = newTemp(Ity_I64);
   12477       IRTemp dLo     = newTemp(Ity_I64);
   12478       const HChar* str = "???";
   12479       Int    laneszB = 0;
   12480 
   12481       switch (insn[2]) {
   12482          case 0x08: laneszB = 1; str = "b"; break;
   12483          case 0x09: laneszB = 2; str = "w"; break;
   12484          case 0x0A: laneszB = 4; str = "d"; break;
   12485          default: vassert(0);
   12486       }
   12487 
   12488       modrm = insn[3];
   12489       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12490 
   12491       if (epartIsReg(modrm)) {
   12492          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12493          delta += 3+1;
   12494          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12495                                      nameXMMReg(gregOfRM(modrm)));
   12496       } else {
   12497          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12498          gen_SEGV_if_not_16_aligned( addr );
   12499          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12500          delta += 3+alen;
   12501          DIP("psign%s %s,%s\n", str, dis_buf,
   12502                                      nameXMMReg(gregOfRM(modrm)));
   12503       }
   12504 
   12505       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12506       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12507       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12508       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12509 
   12510       putXMMReg(
   12511          gregOfRM(modrm),
   12512          binop(Iop_64HLtoV128,
   12513                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12514                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12515          )
   12516       );
   12517       goto decode_success;
   12518    }
   12519 
   12520    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12521    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12522    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12523    if (sz == 4
   12524        && insn[0] == 0x0F && insn[1] == 0x38
   12525        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12526       IRTemp sV      = newTemp(Ity_I64);
   12527       const HChar* str = "???";
   12528       Int    laneszB = 0;
   12529 
   12530       switch (insn[2]) {
   12531          case 0x1C: laneszB = 1; str = "b"; break;
   12532          case 0x1D: laneszB = 2; str = "w"; break;
   12533          case 0x1E: laneszB = 4; str = "d"; break;
   12534          default: vassert(0);
   12535       }
   12536 
   12537       modrm = insn[3];
   12538       do_MMX_preamble();
   12539 
   12540       if (epartIsReg(modrm)) {
   12541          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12542          delta += 3+1;
   12543          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12544                                     nameMMXReg(gregOfRM(modrm)));
   12545       } else {
   12546          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12547          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12548          delta += 3+alen;
   12549          DIP("pabs%s %s,%s\n", str, dis_buf,
   12550                                     nameMMXReg(gregOfRM(modrm)));
   12551       }
   12552 
   12553       putMMXReg(
   12554          gregOfRM(modrm),
   12555          dis_PABS_helper( mkexpr(sV), laneszB )
   12556       );
   12557       goto decode_success;
   12558    }
   12559 
   12560    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12561    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12562    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12563    if (sz == 2
   12564        && insn[0] == 0x0F && insn[1] == 0x38
   12565        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12566       IRTemp sV      = newTemp(Ity_V128);
   12567       IRTemp sHi     = newTemp(Ity_I64);
   12568       IRTemp sLo     = newTemp(Ity_I64);
   12569       const HChar* str = "???";
   12570       Int    laneszB = 0;
   12571 
   12572       switch (insn[2]) {
   12573          case 0x1C: laneszB = 1; str = "b"; break;
   12574          case 0x1D: laneszB = 2; str = "w"; break;
   12575          case 0x1E: laneszB = 4; str = "d"; break;
   12576          default: vassert(0);
   12577       }
   12578 
   12579       modrm = insn[3];
   12580 
   12581       if (epartIsReg(modrm)) {
   12582          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12583          delta += 3+1;
   12584          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12585                                     nameXMMReg(gregOfRM(modrm)));
   12586       } else {
   12587          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12588          gen_SEGV_if_not_16_aligned( addr );
   12589          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12590          delta += 3+alen;
   12591          DIP("pabs%s %s,%s\n", str, dis_buf,
   12592                                     nameXMMReg(gregOfRM(modrm)));
   12593       }
   12594 
   12595       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12596       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12597 
   12598       putXMMReg(
   12599          gregOfRM(modrm),
   12600          binop(Iop_64HLtoV128,
   12601                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12602                dis_PABS_helper( mkexpr(sLo), laneszB )
   12603          )
   12604       );
   12605       goto decode_success;
   12606    }
   12607 
   12608    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12609    if (sz == 4
   12610        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12611       IRTemp sV  = newTemp(Ity_I64);
   12612       IRTemp dV  = newTemp(Ity_I64);
   12613       IRTemp res = newTemp(Ity_I64);
   12614 
   12615       modrm = insn[3];
   12616       do_MMX_preamble();
   12617       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12618       /* E (mmx reg or mem) supplies sV; the byte after it goes in d32. */
   12619       if (epartIsReg(modrm)) {
   12620          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12621          d32 = (UInt)insn[3+1];   /* immediate follows the modrm byte */
   12622          delta += 3+1+1;
   12623          DIP("palignr $%u,%s,%s\n",  d32,
   12624                                      nameMMXReg(eregOfRM(modrm)),
   12625                                      nameMMXReg(gregOfRM(modrm)));
   12626       } else {
   12627          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12628          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12629          d32 = (UInt)insn[3+alen];   /* immediate follows the amode */
   12630          delta += 3+alen+1;
   12631          DIP("palignr $%u,%s,%s\n", d32,
   12632                                     dis_buf,
   12633                                     nameMMXReg(gregOfRM(modrm)));
   12634       }
   12635       /* d32 is a byte count; pick the IR for res by the range it is in. */
   12636       if (d32 == 0) {   /* no shift: result is sV unchanged */
   12637          assign( res, mkexpr(sV) );
   12638       }
   12639       else if (d32 >= 1 && d32 <= 7) {   /* sV shifted down, dV ORed in above */
   12640          assign(res,
   12641                 binop(Iop_Or64,
   12642                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12643                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12644                      )));
   12645       }
   12646       else if (d32 == 8) {   /* result is dV unchanged */
   12647         assign( res, mkexpr(dV) );
   12648       }
   12649       else if (d32 >= 9 && d32 <= 15) {   /* only dV contributes */
   12650          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12651       }
   12652       else if (d32 >= 16 && d32 <= 255) {   /* result is all zeroes */
   12653          assign( res, mkU64(0) );
   12654       }
   12655       else
   12656          vassert(0);
   12657 
   12658       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12659       goto decode_success;
   12660    }
   12661 
   12662    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12663    if (sz == 2
   12664        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12665       IRTemp sV  = newTemp(Ity_V128);
   12666       IRTemp dV  = newTemp(Ity_V128);
   12667       IRTemp sHi = newTemp(Ity_I64);
   12668       IRTemp sLo = newTemp(Ity_I64);
   12669       IRTemp dHi = newTemp(Ity_I64);
   12670       IRTemp dLo = newTemp(Ity_I64);
   12671       IRTemp rHi = newTemp(Ity_I64);
   12672       IRTemp rLo = newTemp(Ity_I64);
   12673 
   12674       modrm = insn[3];
   12675       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12676 
   12677       if (epartIsReg(modrm)) {
   12678          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12679          d32 = (UInt)insn[3+1];
   12680          delta += 3+1+1;
   12681          DIP("palignr $%u,%s,%s\n", d32,
   12682                                     nameXMMReg(eregOfRM(modrm)),
   12683                                     nameXMMReg(gregOfRM(modrm)));
   12684       } else {
   12685          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12686          gen_SEGV_if_not_16_aligned( addr );
   12687          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12688          d32 = (UInt)insn[3+alen];
   12689          delta += 3+alen+1;
   12690          DIP("palignr $%u,%s,%s\n", d32,
   12691                                     dis_buf,
   12692                                     nameXMMReg(gregOfRM(modrm)));
   12693       }
   12694 
   12695       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12696       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12697       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12698       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12699 
   12700       if (d32 == 0) {
   12701          assign( rHi, mkexpr(sHi) );
   12702          assign( rLo, mkexpr(sLo) );
   12703       }
   12704       else if (d32 >= 1 && d32 <= 7) {
   12705          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12706          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12707       }
   12708       else if (d32 == 8) {
   12709          assign( rHi, mkexpr(dLo) );
   12710          assign( rLo, mkexpr(sHi) );
   12711       }
   12712       else if (d32 >= 9 && d32 <= 15) {
   12713          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12714          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12715       }
   12716       else if (d32 == 16) {
   12717          assign( rHi, mkexpr(dHi) );
   12718          assign( rLo, mkexpr(dLo) );
   12719       }
   12720       else if (d32 >= 17 && d32 <= 23) {
   12721          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12722          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12723       }
   12724       else if (d32 == 24) {
   12725          assign( rHi, mkU64(0) );
   12726          assign( rLo, mkexpr(dHi) );
   12727       }
   12728       else if (d32 >= 25 && d32 <= 31) {
   12729          assign( rHi, mkU64(0) );
   12730          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12731       }
   12732       else if (d32 >= 32 && d32 <= 255) {
   12733          assign( rHi, mkU64(0) );
   12734          assign( rLo, mkU64(0) );
   12735       }
   12736       else
   12737          vassert(0);
   12738 
   12739       putXMMReg(
   12740          gregOfRM(modrm),
   12741          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12742       );
   12743       goto decode_success;
   12744    }
   12745 
   12746    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12747    if (sz == 4
   12748        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12749       IRTemp sV      = newTemp(Ity_I64);
   12750       IRTemp dV      = newTemp(Ity_I64);
   12751 
   12752       modrm = insn[3];
   12753       do_MMX_preamble();
   12754       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12755 
   12756       if (epartIsReg(modrm)) {
   12757          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12758          delta += 3+1;
   12759          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12760                                nameMMXReg(gregOfRM(modrm)));
   12761       } else {
   12762          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12763          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12764          delta += 3+alen;
   12765          DIP("pshufb %s,%s\n", dis_buf,
   12766                                nameMMXReg(gregOfRM(modrm)));
   12767       }
   12768 
   12769       putMMXReg(
   12770          gregOfRM(modrm),
   12771          binop(
   12772             Iop_And64,
   12773             /* permute the lanes */
   12774             binop(
   12775                Iop_Perm8x8,
   12776                mkexpr(dV),
   12777                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12778             ),
   12779             /* mask off lanes which have (index & 0x80) == 0x80 */
   12780             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12781          )
   12782       );
   12783       goto decode_success;
   12784    }
   12785 
   12786    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12787    if (sz == 2
   12788        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12789       IRTemp sV         = newTemp(Ity_V128);
   12790       IRTemp dV         = newTemp(Ity_V128);
   12791       IRTemp sHi        = newTemp(Ity_I64);
   12792       IRTemp sLo        = newTemp(Ity_I64);
   12793       IRTemp dHi        = newTemp(Ity_I64);
   12794       IRTemp dLo        = newTemp(Ity_I64);
   12795       IRTemp rHi        = newTemp(Ity_I64);
   12796       IRTemp rLo        = newTemp(Ity_I64);
   12797       IRTemp sevens     = newTemp(Ity_I64);
   12798       IRTemp mask0x80hi = newTemp(Ity_I64);
   12799       IRTemp mask0x80lo = newTemp(Ity_I64);
   12800       IRTemp maskBit3hi = newTemp(Ity_I64);
   12801       IRTemp maskBit3lo = newTemp(Ity_I64);
   12802       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12803       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12804       IRTemp permdHi    = newTemp(Ity_I64);
   12805       IRTemp permdLo    = newTemp(Ity_I64);
   12806 
   12807       modrm = insn[3];
   12808       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12809 
   12810       if (epartIsReg(modrm)) {
   12811          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12812          delta += 3+1;
   12813          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12814                                nameXMMReg(gregOfRM(modrm)));
   12815       } else {
   12816          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12817          gen_SEGV_if_not_16_aligned( addr );
   12818          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12819          delta += 3+alen;
   12820          DIP("pshufb %s,%s\n", dis_buf,
   12821                                nameXMMReg(gregOfRM(modrm)));
   12822       }
   12823 
   12824       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12825       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12826       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12827       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12828 
   12829       assign( sevens, mkU64(0x0707070707070707ULL) );
   12830 
   12831       /*
   12832       mask0x80hi = Not(SarN8x8(sHi,7))
   12833       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12834       sAnd7hi    = And(sHi,sevens)
   12835       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12836                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12837       rHi        = And(permdHi,mask0x80hi)
   12838       */
   12839       assign(
   12840          mask0x80hi,
   12841          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12842 
   12843       assign(
   12844          maskBit3hi,
   12845          binop(Iop_SarN8x8,
   12846                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12847                mkU8(7)));
   12848 
   12849       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12850 
   12851       assign(
   12852          permdHi,
   12853          binop(
   12854             Iop_Or64,
   12855             binop(Iop_And64,
   12856                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12857                   mkexpr(maskBit3hi)),
   12858             binop(Iop_And64,
   12859                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12860                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12861 
   12862       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12863 
   12864       /* And the same for the lower half of the result.  What fun. */
   12865 
   12866       assign(
   12867          mask0x80lo,
   12868          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12869 
   12870       assign(
   12871          maskBit3lo,
   12872          binop(Iop_SarN8x8,
   12873                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12874                mkU8(7)));
   12875 
   12876       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12877 
   12878       assign(
   12879          permdLo,
   12880          binop(
   12881             Iop_Or64,
   12882             binop(Iop_And64,
   12883                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12884                   mkexpr(maskBit3lo)),
   12885             binop(Iop_And64,
   12886                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12887                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12888 
   12889       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12890 
   12891       putXMMReg(
   12892          gregOfRM(modrm),
   12893          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12894       );
   12895       goto decode_success;
   12896    }
   12897 
   12898    /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
   12899    /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
   12900    if ((sz == 2 || sz == 4)
   12901        && insn[0] == 0x0F && insn[1] == 0x38
   12902        && (insn[2] == 0xF0 || insn[2] == 0xF1)
   12903        && !epartIsReg(insn[3])) {
   12904 
   12905       modrm = insn[3];
   12906       addr = disAMode(&alen, sorb, delta + 3, dis_buf);
   12907       delta += 3 + alen;
   12908       ty = szToITy(sz);
   12909       IRTemp src = newTemp(ty);
   12910 
   12911       if (insn[2] == 0xF0) { /* LOAD */
   12912          assign(src, loadLE(ty, mkexpr(addr)));
   12913          IRTemp dst = math_BSWAP(src, ty);
   12914          putIReg(sz, gregOfRM(modrm), mkexpr(dst));
   12915          DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
   12916       } else { /* STORE */
   12917          assign(src, getIReg(sz, gregOfRM(modrm)));
   12918          IRTemp dst = math_BSWAP(src, ty);
   12919          storeLE(mkexpr(addr), mkexpr(dst));
   12920          DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
   12921       }
   12922       goto decode_success;
   12923    }
   12924 
   12925    /* ---------------------------------------------------- */
   12926    /* --- end of the SSSE3 decoder.                    --- */
   12927    /* ---------------------------------------------------- */
   12928 
   12929    /* ---------------------------------------------------- */
   12930    /* --- start of the SSE4 decoder                    --- */
   12931    /* ---------------------------------------------------- */
   12932 
   12933    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12934       (Partial implementation only -- only deal with cases where
   12935       the rounding mode is specified directly by the immediate byte.)
   12936       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12937       (Limitations ditto)
   12938    */
   12939    if (sz == 2
   12940        && insn[0] == 0x0F && insn[1] == 0x3A
   12941        && (insn[2] == 0x0B || insn[2] == 0x0A)) {
   12942       /* isD selects the F64 (ROUNDSD) rather than F32 (ROUNDSS) form. */
   12943       Bool   isD = insn[2] == 0x0B;
   12944       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12945       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12946       Int    imm = 0;
   12947 
   12948       modrm = insn[3];
   12949 
   12950       if (epartIsReg(modrm)) {
   12951          assign( src,
   12952                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12953                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12954          imm = insn[3+1];
   12955          if (imm & ~3) goto decode_failure;
   12956          delta += 3+1+1;
   12957          DIP( "rounds%c $%d,%s,%s\n",
   12958               isD ? 'd' : 's',
   12959               imm, nameXMMReg( eregOfRM(modrm) ),
   12960                    nameXMMReg( gregOfRM(modrm) ) );
   12961       } else {
   12962          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12963          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12964          imm = insn[3+alen];
   12965          if (imm & ~3) goto decode_failure;
   12966          delta += 3+alen+1;
   12967          DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
   12968               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12969       }
   12970 
   12971       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12972          that encoding is the same as the encoding for IRRoundingMode,
   12973          we can use that value directly in the IR as a rounding
   12974          mode. */
   12975       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12976                   mkU32(imm & 3), mkexpr(src)) );
   12977 
   12978       if (isD)
   12979          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12980       else
   12981          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12982 
   12983       goto decode_success;
   12984    }
   12985 
   12986    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12987       which we can only decode if we're sure this is an AMD cpu that
   12988       supports LZCNT, since otherwise it's BSR, which behaves
   12989       differently. */
   12990    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12991        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12992       vassert(sz == 2 || sz == 4);
   12993       /*IRType*/ ty  = szToITy(sz);
   12994       IRTemp     src = newTemp(ty);
   12995       modrm = insn[3];
   12996       if (epartIsReg(modrm)) {
   12997          assign(src, getIReg(sz, eregOfRM(modrm)));
   12998          delta += 3+1;
   12999          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   13000              nameIReg(sz, eregOfRM(modrm)),
   13001              nameIReg(sz, gregOfRM(modrm)));
   13002       } else {
   13003          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   13004          assign(src, loadLE(ty, mkexpr(addr)));
   13005          delta += 3+alen;
   13006          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   13007              nameIReg(sz, gregOfRM(modrm)));
   13008       }
   13009 
   13010       IRTemp res = gen_LZCNT(ty, src);
   13011       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   13012 
   13013       // Update flags.  This is pretty lame .. perhaps can do better
   13014       // if this turns out to be performance critical.
   13015       // O S A P are cleared.  Z is set if RESULT == 0.
   13016       // C is set if SRC is zero.
   13017       IRTemp src32 = newTemp(Ity_I32);
   13018       IRTemp res32 = newTemp(Ity_I32);
   13019       assign(src32, widenUto32(mkexpr(src)));
   13020       assign(res32, widenUto32(mkexpr(res)));
   13021 
   13022       IRTemp oszacp = newTemp(Ity_I32);
   13023       assign(
   13024          oszacp,
   13025          binop(Iop_Or32,
   13026                binop(Iop_Shl32,
   13027                      unop(Iop_1Uto32,
   13028                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   13029                      mkU8(X86G_CC_SHIFT_Z)),
   13030                binop(Iop_Shl32,
   13031                      unop(Iop_1Uto32,
   13032                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   13033                      mkU8(X86G_CC_SHIFT_C))
   13034          )
   13035       );
   13036 
   13037       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13038       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13039       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13040       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   13041 
   13042       goto decode_success;
   13043    }
   13044 
   13045    /* ---------------------------------------------------- */
   13046    /* --- end of the SSE4 decoder                      --- */
   13047    /* ---------------------------------------------------- */
   13048 
   13049    after_sse_decoders:
   13050 
   13051    /* ---------------------------------------------------- */
   13052    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   13053    /* ---------------------------------------------------- */
   13054 
   13055    /* 67 E3 = JCXZ (for JECXZ see below) */
   13056    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   13057       delta += 2;
   13058       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13059       delta ++;
   13060       stmt( IRStmt_Exit(
   13061                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   13062                Ijk_Boring,
   13063                IRConst_U32(d32),
   13064                OFFB_EIP
   13065             ));
   13066        DIP("jcxz 0x%x\n", d32);
   13067        goto decode_success;
   13068    }
   13069 
   13070    /* ---------------------------------------------------- */
   13071    /* --- start of the baseline insn decoder            -- */
   13072    /* ---------------------------------------------------- */
   13073 
   13074    /* Get the primary opcode. */
   13075    opc = getIByte(delta); delta++;
   13076 
   13077    /* We get here if the current insn isn't SSE, or this CPU doesn't
   13078       support SSE. */
   13079 
   13080    switch (opc) {
   13081 
   13082    /* ------------------------ Control flow --------------- */
   13083 
   13084    case 0xC2: /* RET imm16 */
   13085       d32 = getUDisp16(delta);
   13086       delta += 2;
   13087       dis_ret(&dres, d32);
   13088       DIP("ret %u\n", d32);
   13089       break;
   13090    case 0xC3: /* RET */
   13091       dis_ret(&dres, 0);
   13092       DIP("ret\n");
   13093       break;
   13094 
   13095    case 0xCF: /* IRET */
   13096       /* Note, this is an extremely kludgey and limited implementation
   13097          of iret.  All it really does is:
   13098             popl %EIP; popl %CS; popl %EFLAGS.
   13099          %CS is set but ignored (as it is in (eg) popw %cs). */
   13100       t1 = newTemp(Ity_I32); /* ESP */
   13101       t2 = newTemp(Ity_I32); /* new EIP */
   13102       t3 = newTemp(Ity_I32); /* new CS */
   13103       t4 = newTemp(Ity_I32); /* new EFLAGS */
   13104       assign(t1, getIReg(4,R_ESP));
   13105       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   13106       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   13107       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   13108       /* Get stuff off stack */
   13109       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   13110       /* set %CS (which is ignored anyway) */
   13111       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   13112       /* set %EFLAGS */
   13113       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   13114       /* goto new EIP value */
   13115       jmp_treg(&dres, Ijk_Ret, t2);
   13116       vassert(dres.whatNext == Dis_StopHere);
   13117       DIP("iret (very kludgey)\n");
   13118       break;
   13119 
   13120    case 0xE8: /* CALL J4 */
   13121       d32 = getUDisp32(delta); delta += 4;
   13122       d32 += (guest_EIP_bbstart+delta);
   13123       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   13124       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   13125                                          && getIByte(delta) <= 0x5F) {
   13126          /* Specially treat the position-independent-code idiom
   13127                  call X
   13128               X: popl %reg
   13129             as
   13130                  movl %eip, %reg.
   13131             since this generates better code, but for no other reason. */
   13132          Int archReg = getIByte(delta) - 0x58;
   13133          /* vex_printf("-- fPIC thingy\n"); */
   13134          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   13135          delta++; /* Step over the POP */
   13136          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   13137       } else {
   13138          /* The normal sequence for a call. */
   13139          t1 = newTemp(Ity_I32);
   13140          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   13141          putIReg(4, R_ESP, mkexpr(t1));
   13142          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   13143          if (resteerOkFn( callback_opaque, (Addr32)d32 )) {
   13144             /* follow into the call target. */
   13145             dres.whatNext   = Dis_ResteerU;
   13146             dres.continueAt = (Addr32)d32;
   13147          } else {
   13148             jmp_lit(&dres, Ijk_Call, d32);
   13149             vassert(dres.whatNext == Dis_StopHere);
   13150          }
   13151          DIP("call 0x%x\n",d32);
   13152       }
   13153       break;
   13154 
   13155 //--    case 0xC8: /* ENTER */
   13156 //--       d32 = getUDisp16(eip); eip += 2;
   13157 //--       abyte = getIByte(delta); delta++;
   13158 //--
   13159 //--       vg_assert(sz == 4);
   13160 //--       vg_assert(abyte == 0);
   13161 //--
   13162 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   13163 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   13164 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   13165 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13166 //--       uLiteral(cb, sz);
   13167 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13168 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   13169 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   13170 //--       if (d32) {
   13171 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13172 //--          uLiteral(cb, d32);
   13173 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13174 //--       }
   13175 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   13176 //--       break;
   13177 
   13178    case 0xC9: /* LEAVE */
   13179       vassert(sz == 4);
   13180       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13181       assign(t1, getIReg(4,R_EBP));
   13182       /* First PUT ESP looks redundant, but need it because ESP must
   13183          always be up-to-date for Memcheck to work... */
   13184       putIReg(4, R_ESP, mkexpr(t1));
   13185       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   13186       putIReg(4, R_EBP, mkexpr(t2));
   13187       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   13188       DIP("leave\n");
   13189       break;
   13190 
   13191    /* ---------------- Misc weird-ass insns --------------- */
   13192 
   13193    case 0x27: /* DAA */
   13194    case 0x2F: /* DAS */
   13195    case 0x37: /* AAA */
   13196    case 0x3F: /* AAS */
   13197       /* An ugly implementation for some ugly instructions.  Oh
   13198 	 well. */
   13199       if (sz != 4) goto decode_failure;
   13200       t1 = newTemp(Ity_I32);
   13201       t2 = newTemp(Ity_I32);
   13202       /* Make up a 32-bit value (t1), with the old value of AX in the
   13203          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13204          bits. */
   13205       assign(t1,
   13206              binop(Iop_16HLto32,
   13207                    unop(Iop_32to16,
   13208                         mk_x86g_calculate_eflags_all()),
   13209                    getIReg(2, R_EAX)
   13210             ));
   13211       /* Call the helper fn, to get a new AX and OSZACP value, and
   13212          poke both back into the guest state.  Also pass the helper
   13213          the actual opcode so it knows which of the 4 instructions it
   13214          is doing the computation for. */
   13215       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   13216       assign(t2,
   13217               mkIRExprCCall(
   13218                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   13219                  &x86g_calculate_daa_das_aaa_aas,
   13220                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13221             ));
   13222      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13223 
   13224      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13225      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13226      stmt( IRStmt_Put( OFFB_CC_DEP1,
   13227                        binop(Iop_And32,
   13228                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13229                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13230                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13231                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13232                             )
   13233                       )
   13234          );
   13235      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   13236         elimination of previous stores to this field work better. */
   13237      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13238      switch (opc) {
   13239         case 0x27: DIP("daa\n"); break;
   13240         case 0x2F: DIP("das\n"); break;
   13241         case 0x37: DIP("aaa\n"); break;
   13242         case 0x3F: DIP("aas\n"); break;
   13243         default: vassert(0);
   13244      }
   13245      break;
   13246 
   13247    case 0xD4: /* AAM */
   13248    case 0xD5: /* AAD */
   13249       d32 = getIByte(delta); delta++;
   13250       if (sz != 4 || d32 != 10) goto decode_failure;
   13251       t1 = newTemp(Ity_I32);
   13252       t2 = newTemp(Ity_I32);
   13253       /* Make up a 32-bit value (t1), with the old value of AX in the
   13254          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13255          bits. */
   13256       assign(t1,
   13257              binop(Iop_16HLto32,
   13258                    unop(Iop_32to16,
   13259                         mk_x86g_calculate_eflags_all()),
   13260                    getIReg(2, R_EAX)
   13261             ));
   13262       /* Call the helper fn, to get a new AX and OSZACP value, and
   13263          poke both back into the guest state.  Also pass the helper
   13264          the actual opcode so it knows which of the 2 instructions it
   13265          is doing the computation for. */
   13266       assign(t2,
   13267               mkIRExprCCall(
   13268                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   13269                  &x86g_calculate_aad_aam,
   13270                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13271             ));
   13272       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13273 
   13274       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13275       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13276       stmt( IRStmt_Put( OFFB_CC_DEP1,
   13277                         binop(Iop_And32,
   13278                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13279                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13280                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13281                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13282                              )
   13283                        )
   13284           );
   13285       /* Set NDEP even though it isn't used.  This makes
   13286          redundant-PUT elimination of previous stores to this field
   13287          work better. */
   13288       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13289 
   13290       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   13291       break;
   13292 
   13293    /* ------------------------ CWD/CDQ -------------------- */
   13294 
   13295    case 0x98: /* CBW */
   13296       if (sz == 4) {
   13297          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   13298          DIP("cwde\n");
   13299       } else {
   13300          vassert(sz == 2);
   13301          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   13302          DIP("cbw\n");
   13303       }
   13304       break;
   13305 
   13306    case 0x99: /* CWD/CDQ: sign-extend (e)AX into (e)DX */
   13307       ty = szToITy(sz);
   13308       putIReg(sz, R_EDX,
   13309                   binop(mkSizedOp(ty,Iop_Sar8),
   13310                         getIReg(sz, R_EAX),
   13311                         mkU8(sz == 2 ? 15 : 31)) );
   13312       DIP(sz == 2 ? "cwd\n" : "cdq\n");
   13313       break;
   13314 
   13315    /* ------------------------ FPU ops -------------------- */
   13316 
   13317    case 0x9E: /* SAHF */
   13318       codegen_SAHF();
   13319       DIP("sahf\n");
   13320       break;
   13321 
   13322    case 0x9F: /* LAHF */
   13323       codegen_LAHF();
   13324       DIP("lahf\n");
   13325       break;
   13326 
   13327    case 0x9B: /* FWAIT */
   13328       /* ignore? */
   13329       DIP("fwait\n");
   13330       break;
   13331 
   13332    case 0xD8:
   13333    case 0xD9:
   13334    case 0xDA:
   13335    case 0xDB:
   13336    case 0xDC:
   13337    case 0xDD:
   13338    case 0xDE:
   13339    case 0xDF: {
   13340       Int  delta0    = delta;
   13341       Bool decode_OK = False;
   13342       delta = dis_FPU ( &decode_OK, sorb, delta );
   13343       if (!decode_OK) {
   13344          delta = delta0;
   13345          goto decode_failure;
   13346       }
   13347       break;
   13348    }
   13349 
   13350    /* ------------------------ INC & DEC ------------------ */
   13351 
   13352    case 0x40: /* INC eAX */
   13353    case 0x41: /* INC eCX */
   13354    case 0x42: /* INC eDX */
   13355    case 0x43: /* INC eBX */
   13356    case 0x44: /* INC eSP */
   13357    case 0x45: /* INC eBP */
   13358    case 0x46: /* INC eSI */
   13359    case 0x47: /* INC eDI */
   13360       vassert(sz == 2 || sz == 4);
   13361       ty = szToITy(sz);
   13362       t1 = newTemp(ty);
   13363       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13364                         getIReg(sz, (UInt)(opc - 0x40)),
   13365                         mkU(ty,1)) );
   13366       setFlags_INC_DEC( True, t1, ty );
   13367       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13368       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13369       break;
   13370 
   13371    case 0x48: /* DEC eAX */
   13372    case 0x49: /* DEC eCX */
   13373    case 0x4A: /* DEC eDX */
   13374    case 0x4B: /* DEC eBX */
   13375    case 0x4C: /* DEC eSP */
   13376    case 0x4D: /* DEC eBP */
   13377    case 0x4E: /* DEC eSI */
   13378    case 0x4F: /* DEC eDI */
   13379       vassert(sz == 2 || sz == 4);
   13380       ty = szToITy(sz);
   13381       t1 = newTemp(ty);
   13382       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13383                         getIReg(sz, (UInt)(opc - 0x48)),
   13384                         mkU(ty,1)) );
   13385       setFlags_INC_DEC( False, t1, ty );
   13386       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13387       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13388       break;
   13389 
   13390    /* ------------------------ INT ------------------------ */
   13391 
   13392    case 0xCC: /* INT 3 */
   13393       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
   13394       vassert(dres.whatNext == Dis_StopHere);
   13395       DIP("int $0x3\n");
   13396       break;
   13397 
   13398    case 0xCD: /* INT imm8 */
   13399       d32 = getIByte(delta); delta++;
   13400 
   13401       /* For any of the cases where we emit a jump (that is, for all
   13402          currently handled cases), it's important that all ArchRegs
   13403          carry their up-to-date value at this point.  So we declare an
   13404          end-of-block here, which forces any TempRegs caching ArchRegs
   13405          to be flushed. */
   13406 
   13407       /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
   13408          restart of this instruction (hence the "-2" two lines below,
   13409          to get the restart EIP to be this instruction).  This is
   13410          probably Linux-specific and it would be more correct to only
   13411          do this if the VexAbiInfo says that is what we should do.
   13412          This used to handle just 0x40-0x43; Jikes RVM uses a larger
   13413          range (0x3F-0x49), and this allows some slack as well. */
   13414       if (d32 >= 0x3F && d32 <= 0x4F) {
   13415          jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
   13416          vassert(dres.whatNext == Dis_StopHere);
   13417          DIP("int $0x%x\n", d32);
   13418          break;
   13419       }
   13420 
   13421       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13422          (darwin syscalls), int $0x91 (Solaris syscalls) and int $0xD2
   13423          (Solaris fasttrap syscalls).  As part of this, note where we are, so we
   13424          can back up the guest to this point if the syscall needs to
   13425          be restarted. */
   13426       IRJumpKind jump_kind;
   13427       switch (d32) {
   13428       case 0x80:
   13429          jump_kind = Ijk_Sys_int128;
   13430          break;
   13431       case 0x81:
   13432          jump_kind = Ijk_Sys_int129;
   13433          break;
   13434       case 0x82:
   13435          jump_kind = Ijk_Sys_int130;
   13436          break;
   13437       case 0x91:
   13438          jump_kind = Ijk_Sys_int145;
   13439          break;
   13440       case 0xD2:
   13441          jump_kind = Ijk_Sys_int210;
   13442          break;
   13443       default:
   13444          /* none of the above */
   13445          goto decode_failure;
   13446       }
   13447 
   13448       stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13449                         mkU32(guest_EIP_curr_instr) ) );
   13450       jmp_lit(&dres, jump_kind, ((Addr32)guest_EIP_bbstart)+delta);
   13451       vassert(dres.whatNext == Dis_StopHere);
   13452       DIP("int $0x%x\n", d32);
   13453       break;
   13454 
   13455    /* ------------------------ Jcond, byte offset --------- */
   13456 
   13457    case 0xEB: /* Jb (jump, byte offset) */
   13458       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13459       delta++;
   13460       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13461          dres.whatNext   = Dis_ResteerU;
   13462          dres.continueAt = (Addr32)d32;
   13463       } else {
   13464          jmp_lit(&dres, Ijk_Boring, d32);
   13465          vassert(dres.whatNext == Dis_StopHere);
   13466       }
   13467       DIP("jmp-8 0x%x\n", d32);
   13468       break;
   13469 
   13470    case 0xE9: /* Jv (jump, 16/32 offset) */
   13471       vassert(sz == 4); /* JRS added 2004 July 11 */
   13472       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13473       delta += sz;
   13474       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13475          dres.whatNext   = Dis_ResteerU;
   13476          dres.continueAt = (Addr32)d32;
   13477       } else {
   13478          jmp_lit(&dres, Ijk_Boring, d32);
   13479          vassert(dres.whatNext == Dis_StopHere);
   13480       }
   13481       DIP("jmp 0x%x\n", d32);
   13482       break;
   13483 
   13484    case 0x70: /* JOb (jump overflow) */
   13485    case 0x71: /* JNOb (jump no overflow) */
   13486    case 0x72: /* JBb/JNAEb (jump below) */
   13487    case 0x73: /* JNBb/JAEb (jump not below) */
   13488    case 0x74: /* JZb/JEb (jump zero) */
   13489    case 0x75: /* JNZb/JNEb (jump not zero) */
   13490    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13491    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13492    case 0x78: /* JSb (jump negative) */
   13493    case 0x79: /* JNSb (jump not negative) */
   13494    case 0x7A: /* JP (jump parity even) */
   13495    case 0x7B: /* JNP/JPO (jump parity odd) */
   13496    case 0x7C: /* JLb/JNGEb (jump less) */
   13497    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13498    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13499    case 0x7F: /* JGb/JNLEb (jump greater) */
   13500     { Int    jmpDelta;
   13501       const HChar* comment  = "";
   13502       jmpDelta = (Int)getSDisp8(delta);
   13503       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13504       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
   13505       delta++;
   13506       if (resteerCisOk
   13507           && vex_control.guest_chase_cond
   13508           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13509           && jmpDelta < 0
   13510           && resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13511          /* Speculation: assume this backward branch is taken.  So we
   13512             need to emit a side-exit to the insn following this one,
   13513             on the negation of the condition, and continue at the
   13514             branch target address (d32).  If we wind up back at the
   13515             first instruction of the trace, just stop; it's better to
   13516             let the IR loop unroller handle that case. */
   13517          stmt( IRStmt_Exit(
   13518                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13519                   Ijk_Boring,
   13520                   IRConst_U32(guest_EIP_bbstart+delta),
   13521                   OFFB_EIP ) );
   13522          dres.whatNext   = Dis_ResteerC;
   13523          dres.continueAt = (Addr32)d32;
   13524          comment = "(assumed taken)";
   13525       }
   13526       else
   13527       if (resteerCisOk
   13528           && vex_control.guest_chase_cond
   13529           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13530           && jmpDelta >= 0
   13531           && resteerOkFn( callback_opaque,
   13532                           (Addr32)(guest_EIP_bbstart+delta)) ) {
   13533          /* Speculation: assume this forward branch is not taken.  So
   13534             we need to emit a side-exit to d32 (the dest) and continue
   13535             disassembling at the insn immediately following this
   13536             one. */
   13537          stmt( IRStmt_Exit(
   13538                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13539                   Ijk_Boring,
   13540                   IRConst_U32(d32),
   13541                   OFFB_EIP ) );
   13542          dres.whatNext   = Dis_ResteerC;
   13543          dres.continueAt = guest_EIP_bbstart + delta;
   13544          comment = "(assumed not taken)";
   13545       }
   13546       else {
   13547          /* Conservative default translation - end the block at this
   13548             point. */
   13549          jcc_01( &dres, (X86Condcode)(opc - 0x70),
   13550                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13551          vassert(dres.whatNext == Dis_StopHere);
   13552       }
   13553       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13554       break;
   13555     }
   13556 
   13557    case 0xE3: /* JECXZ (for JCXZ see above) */
   13558       if (sz != 4) goto decode_failure;
   13559       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13560       delta ++;
   13561       stmt( IRStmt_Exit(
   13562                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13563             Ijk_Boring,
   13564             IRConst_U32(d32),
   13565             OFFB_EIP
   13566           ));
   13567       DIP("jecxz 0x%x\n", d32);
   13568       break;
   13569 
   13570    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13571    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13572    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13573     { /* Again, the docs say this uses ECX/CX as a count depending on
   13574          the address size override, not the operand one.  Since we
   13575          don't handle address size overrides, I guess that means
   13576          ECX. */
   13577       IRExpr* zbit  = NULL;
   13578       IRExpr* count = NULL;
   13579       IRExpr* cond  = NULL;
   13580       const HChar* xtra = NULL;
   13581 
   13582       if (sz != 4) goto decode_failure;
   13583       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13584       delta++;
   13585       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13586 
   13587       count = getIReg(4,R_ECX);
   13588       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13589       switch (opc) {
   13590          case 0xE2:
   13591             xtra = "";
   13592             break;
   13593          case 0xE1:
   13594             xtra = "e";
   13595             zbit = mk_x86g_calculate_condition( X86CondZ );
   13596 	    cond = mkAnd1(cond, zbit);
   13597             break;
   13598          case 0xE0:
   13599             xtra = "ne";
   13600             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13601 	    cond = mkAnd1(cond, zbit);
   13602             break;
   13603          default:
   13604 	    vassert(0);
   13605       }
   13606       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
   13607 
   13608       DIP("loop%s 0x%x\n", xtra, d32);
   13609       break;
   13610     }
   13611 
   13612    /* ------------------------ IMUL ----------------------- */
   13613 
   13614    case 0x69: /* IMUL Iv, Ev, Gv */
   13615       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13616       break;
   13617    case 0x6B: /* IMUL Ib, Ev, Gv */
   13618       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13619       break;
   13620 
   13621    /* ------------------------ MOV ------------------------ */
   13622 
   13623    case 0x88: /* MOV Gb,Eb */
   13624       delta = dis_mov_G_E(sorb, 1, delta);
   13625       break;
   13626 
   13627    case 0x89: /* MOV Gv,Ev */
   13628       delta = dis_mov_G_E(sorb, sz, delta);
   13629       break;
   13630 
   13631    case 0x8A: /* MOV Eb,Gb */
   13632       delta = dis_mov_E_G(sorb, 1, delta);
   13633       break;
   13634 
   13635    case 0x8B: /* MOV Ev,Gv */
   13636       delta = dis_mov_E_G(sorb, sz, delta);
   13637       break;
   13638 
   13639    case 0x8D: /* LEA M,Gv */
   13640       if (sz != 4)
   13641          goto decode_failure;
   13642       modrm = getIByte(delta);
   13643       if (epartIsReg(modrm))
   13644          goto decode_failure;
   13645       /* NOTE!  this is the one place where a segment override prefix
   13646          has no effect on the address calculation.  Therefore we pass
   13647          zero instead of sorb here. */
   13648       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13649       delta += alen;
   13650       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13651       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13652                             nameIReg(sz,gregOfRM(modrm)));
   13653       break;
   13654 
   13655    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13656       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13657       break;
   13658 
   13659    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13660       delta = dis_mov_Ew_Sw(sorb, delta);
   13661       break;
   13662 
   13663    case 0xA0: /* MOV Ob,AL */
   13664       sz = 1;
   13665       /* Fall through ... */
   13666    case 0xA1: /* MOV Ov,eAX */
   13667       d32 = getUDisp32(delta); delta += 4;
   13668       ty = szToITy(sz);
   13669       addr = newTemp(Ity_I32);
   13670       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13671       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13672       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13673                                 d32, nameIReg(sz,R_EAX));
   13674       break;
   13675 
   13676    case 0xA2: /* MOV AL,Ob */
   13677       sz = 1;
   13678       /* Fall through ... */
   13679    case 0xA3: /* MOV eAX,Ov */
   13680       d32 = getUDisp32(delta); delta += 4;
   13681       ty = szToITy(sz);
   13682       addr = newTemp(Ity_I32);
   13683       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13684       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13685       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13686                                 sorbTxt(sorb), d32);
   13687       break;
   13688 
   13689    case 0xB0: /* MOV imm,AL */
   13690    case 0xB1: /* MOV imm,CL */
   13691    case 0xB2: /* MOV imm,DL */
   13692    case 0xB3: /* MOV imm,BL */
   13693    case 0xB4: /* MOV imm,AH */
   13694    case 0xB5: /* MOV imm,CH */
   13695    case 0xB6: /* MOV imm,DH */
   13696    case 0xB7: /* MOV imm,BH */
   13697       d32 = getIByte(delta); delta += 1;
   13698       putIReg(1, opc-0xB0, mkU8(d32));
   13699       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13700       break;
   13701 
   13702    case 0xB8: /* MOV imm,eAX */
   13703    case 0xB9: /* MOV imm,eCX */
   13704    case 0xBA: /* MOV imm,eDX */
   13705    case 0xBB: /* MOV imm,eBX */
   13706    case 0xBC: /* MOV imm,eSP */
   13707    case 0xBD: /* MOV imm,eBP */
   13708    case 0xBE: /* MOV imm,eSI */
   13709    case 0xBF: /* MOV imm,eDI */
   13710       d32 = getUDisp(sz,delta); delta += sz;
   13711       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13712       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13713       break;
   13714 
   13715    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   13716       sz = 1;
   13717       goto maybe_do_Mov_I_E;
   13718    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   13719       goto maybe_do_Mov_I_E;
   13720 
   13721    maybe_do_Mov_I_E:
   13722       modrm = getIByte(delta);
   13723       if (gregOfRM(modrm) == 0) {
   13724          if (epartIsReg(modrm)) {
   13725             delta++; /* mod/rm byte */
   13726             d32 = getUDisp(sz,delta); delta += sz;
   13727             putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13728             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13729                                      nameIReg(sz,eregOfRM(modrm)));
   13730          } else {
   13731             addr = disAMode ( &alen, sorb, delta, dis_buf );
   13732             delta += alen;
   13733             d32 = getUDisp(sz,delta); delta += sz;
   13734             storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13735             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13736          }
   13737          break;
   13738       }
   13739       goto decode_failure;
   13740 
   13741    /* ------------------------ opl imm, A ----------------- */
   13742 
   13743    case 0x04: /* ADD Ib, AL */
   13744       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13745       break;
   13746    case 0x05: /* ADD Iv, eAX */
   13747       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13748       break;
   13749 
   13750    case 0x0C: /* OR Ib, AL */
   13751       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13752       break;
   13753    case 0x0D: /* OR Iv, eAX */
   13754       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13755       break;
   13756 
   13757    case 0x14: /* ADC Ib, AL */
   13758       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13759       break;
   13760    case 0x15: /* ADC Iv, eAX */
   13761       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13762       break;
   13763 
   13764    case 0x1C: /* SBB Ib, AL */
   13765       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13766       break;
   13767    case 0x1D: /* SBB Iv, eAX */
   13768       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13769       break;
   13770 
   13771    case 0x24: /* AND Ib, AL */
   13772       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13773       break;
   13774    case 0x25: /* AND Iv, eAX */
   13775       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13776       break;
   13777 
   13778    case 0x2C: /* SUB Ib, AL */
   13779       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13780       break;
   13781    case 0x2D: /* SUB Iv, eAX */
   13782       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13783       break;
   13784 
   13785    case 0x34: /* XOR Ib, AL */
   13786       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13787       break;
   13788    case 0x35: /* XOR Iv, eAX */
   13789       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13790       break;
   13791 
   13792    case 0x3C: /* CMP Ib, AL */
   13793       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13794       break;
   13795    case 0x3D: /* CMP Iv, eAX */
   13796       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13797       break;
   13798 
   13799    case 0xA8: /* TEST Ib, AL */
   13800       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13801       break;
   13802    case 0xA9: /* TEST Iv, eAX */
   13803       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13804       break;
   13805 
   13806    /* ------------------------ opl Ev, Gv ----------------- */
   13807 
   13808    case 0x02: /* ADD Eb,Gb */
   13809       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13810       break;
   13811    case 0x03: /* ADD Ev,Gv */
   13812       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13813       break;
   13814 
   13815    case 0x0A: /* OR Eb,Gb */
   13816       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13817       break;
   13818    case 0x0B: /* OR Ev,Gv */
   13819       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13820       break;
   13821 
   13822    case 0x12: /* ADC Eb,Gb */
   13823       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13824       break;
   13825    case 0x13: /* ADC Ev,Gv */
   13826       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13827       break;
   13828 
   13829    case 0x1A: /* SBB Eb,Gb */
   13830       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13831       break;
   13832    case 0x1B: /* SBB Ev,Gv */
   13833       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13834       break;
   13835 
   13836    case 0x22: /* AND Eb,Gb */
   13837       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13838       break;
   13839    case 0x23: /* AND Ev,Gv */
   13840       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13841       break;
   13842 
   13843    case 0x2A: /* SUB Eb,Gb */
   13844       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13845       break;
   13846    case 0x2B: /* SUB Ev,Gv */
   13847       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13848       break;
   13849 
   13850    case 0x32: /* XOR Eb,Gb */
   13851       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13852       break;
   13853    case 0x33: /* XOR Ev,Gv */
   13854       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13855       break;
   13856 
   13857    case 0x3A: /* CMP Eb,Gb */
   13858       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13859       break;
   13860    case 0x3B: /* CMP Ev,Gv */
   13861       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13862       break;
   13863 
   13864    case 0x84: /* TEST Eb,Gb */
   13865       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13866       break;
   13867    case 0x85: /* TEST Ev,Gv */
   13868       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13869       break;
   13870 
   13871    /* ------------------------ opl Gv, Ev ----------------- */
   13872 
   13873    case 0x00: /* ADD Gb,Eb */
   13874       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13875                             Iop_Add8, True, 1, delta, "add" );
   13876       break;
   13877    case 0x01: /* ADD Gv,Ev */
   13878       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13879                             Iop_Add8, True, sz, delta, "add" );
   13880       break;
   13881 
   13882    case 0x08: /* OR Gb,Eb */
   13883       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13884                             Iop_Or8, True, 1, delta, "or" );
   13885       break;
   13886    case 0x09: /* OR Gv,Ev */
   13887       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13888                             Iop_Or8, True, sz, delta, "or" );
   13889       break;
   13890 
   13891    case 0x10: /* ADC Gb,Eb */
   13892       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13893                             Iop_Add8, True, 1, delta, "adc" );
   13894       break;
   13895    case 0x11: /* ADC Gv,Ev */
   13896       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13897                             Iop_Add8, True, sz, delta, "adc" );
   13898       break;
   13899 
   13900    case 0x18: /* SBB Gb,Eb */
   13901       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13902                             Iop_Sub8, True, 1, delta, "sbb" );
   13903       break;
   13904    case 0x19: /* SBB Gv,Ev */
   13905       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13906                             Iop_Sub8, True, sz, delta, "sbb" );
   13907       break;
   13908 
   13909    case 0x20: /* AND Gb,Eb */
   13910       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13911                             Iop_And8, True, 1, delta, "and" );
   13912       break;
   13913    case 0x21: /* AND Gv,Ev */
   13914       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13915                             Iop_And8, True, sz, delta, "and" );
   13916       break;
   13917 
   13918    case 0x28: /* SUB Gb,Eb */
   13919       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13920                             Iop_Sub8, True, 1, delta, "sub" );
   13921       break;
   13922    case 0x29: /* SUB Gv,Ev */
   13923       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13924                             Iop_Sub8, True, sz, delta, "sub" );
   13925       break;
   13926 
   13927    case 0x30: /* XOR Gb,Eb */
   13928       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13929                             Iop_Xor8, True, 1, delta, "xor" );
   13930       break;
   13931    case 0x31: /* XOR Gv,Ev */
   13932       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13933                             Iop_Xor8, True, sz, delta, "xor" );
   13934       break;
   13935 
   13936    case 0x38: /* CMP Gb,Eb */
   13937       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13938                             Iop_Sub8, False, 1, delta, "cmp" );
   13939       break;
   13940    case 0x39: /* CMP Gv,Ev */
   13941       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13942                             Iop_Sub8, False, sz, delta, "cmp" );
   13943       break;
   13944 
   13945    /* ------------------------ POP ------------------------ */
   13946 
   13947    case 0x58: /* POP eAX */
   13948    case 0x59: /* POP eCX */
   13949    case 0x5A: /* POP eDX */
   13950    case 0x5B: /* POP eBX */
   13951    case 0x5D: /* POP eBP */
   13952    case 0x5E: /* POP eSI */
   13953    case 0x5F: /* POP eDI */
   13954    case 0x5C: /* POP eSP */
   13955       vassert(sz == 2 || sz == 4);
   13956       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13957       assign(t2, getIReg(4, R_ESP));
   13958       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13959       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13960       putIReg(sz, opc-0x58, mkexpr(t1));
   13961       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13962       break;
   13963 
   13964    case 0x9D: /* POPF */
   13965       vassert(sz == 2 || sz == 4);
   13966       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13967       assign(t2, getIReg(4, R_ESP));
   13968       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13969       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13970 
   13971       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13972 	 value in t1. */
   13973       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13974                                  ((Addr32)guest_EIP_bbstart)+delta );
   13975 
   13976       DIP("popf%c\n", nameISize(sz));
   13977       break;
   13978 
   13979    case 0x61: /* POPA */
   13980       /* This is almost certainly wrong for sz==2.  So ... */
   13981       if (sz != 4) goto decode_failure;
   13982 
   13983       /* t5 is the old %ESP value. */
   13984       t5 = newTemp(Ity_I32);
   13985       assign( t5, getIReg(4, R_ESP) );
   13986 
   13987       /* Reload all the registers, except %esp. */
   13988       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13989       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13990       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13991       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13992       /* ignore saved %ESP */
   13993       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13994       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13995       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13996 
   13997       /* and move %ESP back up */
   13998       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13999 
   14000       DIP("popa%c\n", nameISize(sz));
   14001       break;
   14002 
   14003    case 0x8F: /* POPL/POPW m32 */
   14004      { Int    len;
   14005        UChar  rm = getIByte(delta);
   14006 
   14007        /* make sure this is a valid POP: memory operand, reg field /0 */
   14008        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   14009           goto decode_failure;
   14010        /* and has correct size */
   14011        if (sz != 4 && sz != 2)
   14012           goto decode_failure;
   14013        ty = szToITy(sz);
   14014 
   14015        t1 = newTemp(Ity_I32); /* stack address */
   14016        t3 = newTemp(ty); /* data */
   14017        /* set t1 to ESP: t1 = ESP */
   14018        assign( t1, getIReg(4, R_ESP) );
   14019        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   14020        assign( t3, loadLE(ty, mkexpr(t1)) );
   14021 
   14022        /* increase ESP; must be done before the STORE.  Intel manual says:
   14023             If the ESP register is used as a base register for addressing
   14024             a destination operand in memory, the POP instruction computes
   14025             the effective address of the operand after it increments the
   14026             ESP register.
   14027        */
   14028        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   14029 
   14030        /* resolve MODR/M */
   14031        addr = disAMode ( &len, sorb, delta, dis_buf);
   14032        storeLE( mkexpr(addr), mkexpr(t3) );
   14033 
   14034        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   14035 
   14036        delta += len;
   14037        break;
   14038      }
   14039 
   14040    case 0x1F: /* POP %DS */
   14041       dis_pop_segreg( R_DS, sz ); break;
   14042    case 0x07: /* POP %ES */
   14043       dis_pop_segreg( R_ES, sz ); break;
   14044    case 0x17: /* POP %SS */
   14045       dis_pop_segreg( R_SS, sz ); break;
   14046 
   14047    /* ------------------------ PUSH ----------------------- */
   14048 
   14049    case 0x50: /* PUSH eAX */
   14050    case 0x51: /* PUSH eCX */
   14051    case 0x52: /* PUSH eDX */
   14052    case 0x53: /* PUSH eBX */
   14053    case 0x55: /* PUSH eBP */
   14054    case 0x56: /* PUSH eSI */
   14055    case 0x57: /* PUSH eDI */
   14056    case 0x54: /* PUSH eSP */
   14057       /* This is the Right Way, in that the value to be pushed is
   14058          established before %esp is changed, so that pushl %esp
   14059          correctly pushes the old value. */
   14060       vassert(sz == 2 || sz == 4);
   14061       ty = sz==2 ? Ity_I16 : Ity_I32;
   14062       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   14063       assign(t1, getIReg(sz, opc-0x50));
   14064       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   14065       putIReg(4, R_ESP, mkexpr(t2) );
   14066       storeLE(mkexpr(t2),mkexpr(t1));
   14067       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   14068       break;
   14069 
   14070 
   14071    case 0x68: /* PUSH Iv */
   14072       d32 = getUDisp(sz,delta); delta += sz;
   14073       goto do_push_I;
   14074    case 0x6A: /* PUSH Ib, sign-extended to sz */
   14075       d32 = getSDisp8(delta); delta += 1;
   14076       goto do_push_I;
   14077    do_push_I:
   14078       ty = szToITy(sz);
   14079       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   14080       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14081       putIReg(4, R_ESP, mkexpr(t1) );
   14082       /* stop mkU16 asserting if d32 is a negative 16-bit number
   14083          (bug #132813) */
   14084       if (ty == Ity_I16)
   14085          d32 &= 0xFFFF;
   14086       storeLE( mkexpr(t1), mkU(ty,d32) );
   14087       DIP("push%c $0x%x\n", nameISize(sz), d32);
   14088       break;
   14089 
   14090    case 0x9C: /* PUSHF */ {
   14091       vassert(sz == 2 || sz == 4);
   14092 
   14093       t1 = newTemp(Ity_I32);
   14094       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14095       putIReg(4, R_ESP, mkexpr(t1) );
   14096 
   14097       /* Calculate OSZACP, and patch in fixed fields as per
   14098          Intel docs.
   14099          - bit 1 is always 1
   14100          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   14101       */
   14102       t2 = newTemp(Ity_I32);
   14103       assign( t2, binop(Iop_Or32,
   14104                         mk_x86g_calculate_eflags_all(),
   14105                         mkU32( (1<<1)|(1<<9) ) ));
   14106 
   14107       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   14108          baseBlock[OFFB_DFLAG]. */
   14109       t3 = newTemp(Ity_I32);
   14110       assign( t3, binop(Iop_Or32,
   14111                         mkexpr(t2),
   14112                         binop(Iop_And32,
   14113                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   14114                               mkU32(1<<10)))
   14115             );
   14116 
   14117       /* And patch in the ID flag. */
   14118       t4 = newTemp(Ity_I32);
   14119       assign( t4, binop(Iop_Or32,
   14120                         mkexpr(t3),
   14121                         binop(Iop_And32,
   14122                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   14123                                                mkU8(21)),
   14124                               mkU32(1<<21)))
   14125             );
   14126 
   14127       /* And patch in the AC flag. */
   14128       t5 = newTemp(Ity_I32);
   14129       assign( t5, binop(Iop_Or32,
   14130                         mkexpr(t4),
   14131                         binop(Iop_And32,
   14132                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   14133                                                mkU8(18)),
   14134                               mkU32(1<<18)))
   14135             );
   14136 
   14137       /* if sz==2, the stored value needs to be narrowed. */
   14138       if (sz == 2)
   14139         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   14140       else
   14141         storeLE( mkexpr(t1), mkexpr(t5) );
   14142 
   14143       DIP("pushf%c\n", nameISize(sz));
   14144       break;
   14145    }
   14146 
   14147    case 0x60: /* PUSHA */
   14148       /* This is almost certainly wrong for sz==2.  So ... */
   14149       if (sz != 4) goto decode_failure;
   14150 
   14151       /* This is the Right Way, in that the value to be pushed is
   14152          established before %esp is changed, so that pusha
   14153          correctly pushes the old %esp value.  The new value of
   14154          %esp is written to the register before any of the stores. */
   14155       /* t0 is the %ESP value we're going to push. */
   14156       t0 = newTemp(Ity_I32);
   14157       assign( t0, getIReg(4, R_ESP) );
   14158 
   14159       /* t5 will be the new %ESP value. */
   14160       t5 = newTemp(Ity_I32);
   14161       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   14162 
   14163       /* Update guest state before prodding memory. */
   14164       putIReg(4, R_ESP, mkexpr(t5));
   14165 
   14166       /* Dump all the registers. */
   14167       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   14168       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   14169       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   14170       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   14171       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   14172       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   14173       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   14174       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   14175 
   14176       DIP("pusha%c\n", nameISize(sz));
   14177       break;
   14178 
   14179    case 0x0E: /* PUSH %CS */
   14180       dis_push_segreg( R_CS, sz ); break;
   14181    case 0x1E: /* PUSH %DS */
   14182       dis_push_segreg( R_DS, sz ); break;
   14183    case 0x06: /* PUSH %ES */
   14184       dis_push_segreg( R_ES, sz ); break;
   14185    case 0x16: /* PUSH %SS */
   14186       dis_push_segreg( R_SS, sz ); break;
   14187 
   14188    /* ------------------------ SCAS et al ----------------- */
   14189 
   14190    case 0xA4: /* MOVS, no REP prefix */
   14191    case 0xA5:
   14192       if (sorb != 0)
   14193          goto decode_failure; /* else dis_string_op asserts */
   14194       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   14195       break;
   14196 
   14197   case 0xA6: /* CMPS, no REP prefix */
   14198   case 0xA7:
   14199       if (sorb != 0)
   14200          goto decode_failure; /* else dis_string_op asserts */
   14201       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   14202       break;
   14203 
   14204    case 0xAA: /* STOS, no REP prefix */
   14205    case 0xAB:
   14206       if (sorb != 0)
   14207          goto decode_failure; /* else dis_string_op asserts */
   14208       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   14209       break;
   14210 
   14211    case 0xAC: /* LODS, no REP prefix */
   14212    case 0xAD:
   14213       if (sorb != 0)
   14214          goto decode_failure; /* else dis_string_op asserts */
   14215       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   14216       break;
   14217 
   14218    case 0xAE: /* SCAS, no REP prefix */
   14219    case 0xAF:
   14220       if (sorb != 0)
   14221          goto decode_failure; /* else dis_string_op asserts */
   14222       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   14223       break;
   14224 
   14225 
   14226    case 0xFC: /* CLD */
   14227       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   14228       DIP("cld\n");
   14229       break;
   14230 
   14231    case 0xFD: /* STD */
   14232       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   14233       DIP("std\n");
   14234       break;
   14235 
   14236    case 0xF8: /* CLC */
   14237    case 0xF9: /* STC */
   14238    case 0xF5: /* CMC */
   14239       t0 = newTemp(Ity_I32);
   14240       t1 = newTemp(Ity_I32);
   14241       assign( t0, mk_x86g_calculate_eflags_all() );
   14242       switch (opc) {
   14243          case 0xF8:
   14244             assign( t1, binop(Iop_And32, mkexpr(t0),
   14245                                          mkU32(~X86G_CC_MASK_C)));
   14246             DIP("clc\n");
   14247             break;
   14248          case 0xF9:
   14249             assign( t1, binop(Iop_Or32, mkexpr(t0),
   14250                                         mkU32(X86G_CC_MASK_C)));
   14251             DIP("stc\n");
   14252             break;
   14253          case 0xF5:
   14254             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   14255                                          mkU32(X86G_CC_MASK_C)));
   14256             DIP("cmc\n");
   14257             break;
   14258          default:
   14259             vpanic("disInstr(x86)(clc/stc/cmc)");
   14260       }
   14261       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14262       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14263       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   14264       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   14265          elimination of previous stores to this field work better. */
   14266       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14267       break;
   14268 
   14269    case 0xD6: /* SALC */
   14270       t0 = newTemp(Ity_I32);
   14271       t1 = newTemp(Ity_I32);
   14272       assign( t0,  binop(Iop_And32,
   14273                          mk_x86g_calculate_eflags_c(),
   14274                          mkU32(1)) );
   14275       assign( t1, binop(Iop_Sar32,
   14276                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   14277                         mkU8(31)) );
   14278       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   14279       DIP("salc\n");
   14280       break;
   14281 
   14282    /* REPNE prefix insn */
   14283    case 0xF2: {
   14284       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14285       if (sorb != 0) goto decode_failure;
   14286       abyte = getIByte(delta); delta++;
   14287 
   14288       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14289 
   14290       switch (abyte) {
   14291       /* According to the Intel manual, "repne movs" should never occur, but
   14292        * in practice it has happened, so allow for it here... */
   14293       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   14294       case 0xA5:
   14295          dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
   14296                              guest_EIP_bbstart+delta, "repne movs" );
   14297          break;
   14298 
   14299       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   14300       case 0xA7:
   14301          dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
   14302                              guest_EIP_bbstart+delta, "repne cmps" );
   14303          break;
   14304 
   14305       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   14306       case 0xAB:
   14307          dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
   14308                              guest_EIP_bbstart+delta, "repne stos" );
   14309          break;
   14310 
   14311       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   14312       case 0xAF:
   14313          dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
   14314                              guest_EIP_bbstart+delta, "repne scas" );
   14315          break;
   14316 
   14317       default:
   14318          goto decode_failure;
   14319       }
   14320       break;
   14321    }
   14322 
   14323    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   14324       for the rest, it means REP) */
   14325    case 0xF3: {
   14326       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14327       abyte = getIByte(delta); delta++;
   14328 
   14329       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14330 
   14331       if (sorb != 0 && abyte != 0x0F) goto decode_failure;
   14332 
   14333       switch (abyte) {
   14334       case 0x0F:
   14335          switch (getIByte(delta)) {
   14336          /* On older CPUs, TZCNT behaves the same as BSF.  */
   14337          case 0xBC: /* REP BSF Gv,Ev */
   14338             delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
   14339             break;
   14340          /* On older CPUs, LZCNT behaves the same as BSR.  */
   14341          case 0xBD: /* REP BSR Gv,Ev */
   14342             delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
   14343             break;
   14344          default:
   14345             goto decode_failure;
   14346          }
   14347          break;
   14348 
   14349       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   14350       case 0xA5:
   14351          dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
   14352                              guest_EIP_bbstart+delta, "rep movs" );
   14353          break;
   14354 
   14355       case 0xA6: sz = 1;   /* REPE CMPS<sz> */
   14356       case 0xA7:
   14357          dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
   14358                              guest_EIP_bbstart+delta, "repe cmps" );
   14359          break;
   14360 
   14361       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14362       case 0xAB:
   14363          dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
   14364                              guest_EIP_bbstart+delta, "rep stos" );
   14365          break;
   14366 
   14367       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14368       case 0xAD:
   14369          dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
   14370                              guest_EIP_bbstart+delta, "rep lods" );
   14371          break;
   14372 
   14373       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14374       case 0xAF:
   14375          dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
   14376                              guest_EIP_bbstart+delta, "repe scas" );
   14377          break;
   14378 
   14379       case 0x90:           /* REP NOP (PAUSE) */
   14380          /* a hint to the P4 re spin-wait loop */
   14381          DIP("rep nop (P4 pause)\n");
   14382          /* "observe" the hint.  The Vex client needs to be careful not
   14383             to cause very long delays as a result, though. */
   14384          jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14385          vassert(dres.whatNext == Dis_StopHere);
   14386          break;
   14387 
   14388       case 0xC3:           /* REP RET -- same as normal ret? */
   14389          dis_ret(&dres, 0);
   14390          DIP("rep ret\n");
   14391          break;
   14392 
   14393       default:
   14394          goto decode_failure;
   14395       }
   14396       break;
   14397    }
   14398 
   14399    /* ------------------------ XCHG ----------------------- */
   14400 
   14401    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14402       prefix; hence it must be translated with an IRCAS (at least, the
   14403       memory variant). */
   14404    case 0x86: /* XCHG Gb,Eb */
   14405       sz = 1;
   14406       /* Fall through ... */
   14407    case 0x87: /* XCHG Gv,Ev */
   14408       modrm = getIByte(delta);
   14409       ty = szToITy(sz);
   14410       t1 = newTemp(ty); t2 = newTemp(ty);
   14411       if (epartIsReg(modrm)) {
   14412          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14413          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14414          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14415          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14416          delta++;
   14417          DIP("xchg%c %s, %s\n",
   14418              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14419                             nameIReg(sz,eregOfRM(modrm)));
   14420       } else {
   14421          *expect_CAS = True;
   14422          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14423          assign( t1, loadLE(ty,mkexpr(addr)) );
   14424          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14425          casLE( mkexpr(addr),
   14426                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14427          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14428          delta += alen;
   14429          DIP("xchg%c %s, %s\n", nameISize(sz),
   14430                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14431       }
   14432       break;
   14433 
   14434    case 0x90: /* XCHG eAX,eAX */
   14435       DIP("nop\n");
   14436       break;
   14437    case 0x91: /* XCHG eAX,eCX */
   14438    case 0x92: /* XCHG eAX,eDX */
   14439    case 0x93: /* XCHG eAX,eBX */
   14440    case 0x94: /* XCHG eAX,eSP */
   14441    case 0x95: /* XCHG eAX,eBP */
   14442    case 0x96: /* XCHG eAX,eSI */
   14443    case 0x97: /* XCHG eAX,eDI */
   14444       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14445       break;
   14446 
   14447    /* ------------------------ XLAT ----------------------- */
   14448 
   14449    case 0xD7: /* XLAT */
   14450       if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
   14451       putIReg(
   14452          1,
   14453          R_EAX/*AL*/,
   14454          loadLE(Ity_I8,
   14455                 handleSegOverride(
   14456                    sorb,
   14457                    binop(Iop_Add32,
   14458                          getIReg(4, R_EBX),
   14459                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14460 
   14461       DIP("xlat%c [ebx]\n", nameISize(sz));
   14462       break;
   14463 
   14464    /* ------------------------ IN / OUT ----------------------- */
   14465 
   14466    case 0xE4: /* IN imm8, AL */
   14467       sz = 1;
   14468       t1 = newTemp(Ity_I32);
   14469       abyte = getIByte(delta); delta++;
   14470       assign(t1, mkU32( abyte & 0xFF ));
   14471       DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
   14472       goto do_IN;
   14473    case 0xE5: /* IN imm8, eAX */
   14474       vassert(sz == 2 || sz == 4);
   14475       t1 = newTemp(Ity_I32);
   14476       abyte = getIByte(delta); delta++;
   14477       assign(t1, mkU32( abyte & 0xFF ));
   14478       DIP("in%c $%d,%s\n", nameISize(sz), abyte, nameIReg(sz,R_EAX));
   14479       goto do_IN;
   14480    case 0xEC: /* IN %DX, AL */
   14481       sz = 1;
   14482       t1 = newTemp(Ity_I32);
   14483       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14484       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14485                                          nameIReg(sz,R_EAX));
   14486       goto do_IN;
   14487    case 0xED: /* IN %DX, eAX */
   14488       vassert(sz == 2 || sz == 4);
   14489       t1 = newTemp(Ity_I32);
   14490       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14491       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14492                                          nameIReg(sz,R_EAX));
   14493       goto do_IN;
   14494    do_IN: {
   14495       /* At this point, sz indicates the width, and t1 is a 32-bit
   14496          value giving port number. */
   14497       IRDirty* d;
   14498       vassert(sz == 1 || sz == 2 || sz == 4);
   14499       ty = szToITy(sz);
   14500       t2 = newTemp(Ity_I32);
   14501       d = unsafeIRDirty_1_N(
   14502              t2,
   14503              0/*regparms*/,
   14504              "x86g_dirtyhelper_IN",
   14505              &x86g_dirtyhelper_IN,
   14506              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14507           );
   14508       /* do the call, dumping the result in t2. */
   14509       stmt( IRStmt_Dirty(d) );
   14510       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14511       break;
   14512    }
   14513 
   14514    case 0xE6: /* OUT AL, imm8 */
   14515       sz = 1;
   14516       t1 = newTemp(Ity_I32);
   14517       abyte = getIByte(delta); delta++;
   14518       assign( t1, mkU32( abyte & 0xFF ) );
   14519       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
   14520       goto do_OUT;
   14521    case 0xE7: /* OUT eAX, imm8 */
   14522       vassert(sz == 2 || sz == 4);
   14523       t1 = newTemp(Ity_I32);
   14524       abyte = getIByte(delta); delta++;
   14525       assign( t1, mkU32( abyte & 0xFF ) );
   14526       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), abyte);
   14527       goto do_OUT;
   14528    case 0xEE: /* OUT AL, %DX */
   14529       sz = 1;
   14530       t1 = newTemp(Ity_I32);
   14531       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14532       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14533                                           nameIReg(2,R_EDX));
   14534       goto do_OUT;
   14535    case 0xEF: /* OUT eAX, %DX */
   14536       vassert(sz == 2 || sz == 4);
   14537       t1 = newTemp(Ity_I32);
   14538       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14539       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14540                                           nameIReg(2,R_EDX));
   14541       goto do_OUT;
   14542    do_OUT: {
   14543       /* At this point, sz indicates the width, and t1 is a 32-bit
   14544          value giving port number. */
   14545       IRDirty* d;
   14546       vassert(sz == 1 || sz == 2 || sz == 4);
   14547       ty = szToITy(sz);
   14548       d = unsafeIRDirty_0_N(
   14549              0/*regparms*/,
   14550              "x86g_dirtyhelper_OUT",
   14551              &x86g_dirtyhelper_OUT,
   14552              mkIRExprVec_3( mkexpr(t1),
   14553                             widenUto32( getIReg(sz, R_EAX) ),
   14554                             mkU32(sz) )
   14555           );
   14556       stmt( IRStmt_Dirty(d) );
   14557       break;
   14558    }
   14559 
   14560    /* ------------------------ (Grp1 extensions) ---------- */
   14561 
   14562    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14563                  case 0x80, but only in 32-bit mode. */
   14564       /* fallthru */
   14565    case 0x80: /* Grp1 Ib,Eb */
   14566       modrm = getIByte(delta);
   14567       am_sz = lengthAMode(delta);
   14568       sz    = 1;
   14569       d_sz  = 1;
   14570       d32   = getUChar(delta + am_sz);
   14571       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14572       break;
   14573 
   14574    case 0x81: /* Grp1 Iv,Ev */
   14575       modrm = getIByte(delta);
   14576       am_sz = lengthAMode(delta);
   14577       d_sz  = sz;
   14578       d32   = getUDisp(d_sz, delta + am_sz);
   14579       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14580       break;
   14581 
   14582    case 0x83: /* Grp1 Ib,Ev */
   14583       modrm = getIByte(delta);
   14584       am_sz = lengthAMode(delta);
   14585       d_sz  = 1;
   14586       d32   = getSDisp8(delta + am_sz);
   14587       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14588       break;
   14589 
   14590    /* ------------------------ (Grp2 extensions) ---------- */
   14591 
   14592    case 0xC0: { /* Grp2 Ib,Eb */
   14593       Bool decode_OK = True;
   14594       modrm = getIByte(delta);
   14595       am_sz = lengthAMode(delta);
   14596       d_sz  = 1;
   14597       d32   = getUChar(delta + am_sz);
   14598       sz    = 1;
   14599       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14600                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14601       if (!decode_OK)
   14602          goto decode_failure;
   14603       break;
   14604    }
   14605    case 0xC1: { /* Grp2 Ib,Ev */
   14606       Bool decode_OK = True;
   14607       modrm = getIByte(delta);
   14608       am_sz = lengthAMode(delta);
   14609       d_sz  = 1;
   14610       d32   = getUChar(delta + am_sz);
   14611       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14612                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14613       if (!decode_OK)
   14614          goto decode_failure;
   14615       break;
   14616    }
   14617    case 0xD0: { /* Grp2 1,Eb */
   14618       Bool decode_OK = True;
   14619       modrm = getIByte(delta);
   14620       am_sz = lengthAMode(delta);
   14621       d_sz  = 0;
   14622       d32   = 1;
   14623       sz    = 1;
   14624       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14625                          mkU8(d32), NULL, &decode_OK );
   14626       if (!decode_OK)
   14627          goto decode_failure;
   14628       break;
   14629    }
   14630    case 0xD1: { /* Grp2 1,Ev */
   14631       Bool decode_OK = True;
   14632       modrm = getUChar(delta);
   14633       am_sz = lengthAMode(delta);
   14634       d_sz  = 0;
   14635       d32   = 1;
   14636       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14637                          mkU8(d32), NULL, &decode_OK );
   14638       if (!decode_OK)
   14639          goto decode_failure;
   14640       break;
   14641    }
   14642    case 0xD2: { /* Grp2 CL,Eb */
   14643       Bool decode_OK = True;
   14644       modrm = getUChar(delta);
   14645       am_sz = lengthAMode(delta);
   14646       d_sz  = 0;
   14647       sz    = 1;
   14648       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14649                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14650       if (!decode_OK)
   14651          goto decode_failure;
   14652       break;
   14653    }
   14654    case 0xD3: { /* Grp2 CL,Ev */
   14655       Bool decode_OK = True;
   14656       modrm = getIByte(delta);
   14657       am_sz = lengthAMode(delta);
   14658       d_sz  = 0;
   14659       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14660                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14661       if (!decode_OK)
   14662          goto decode_failure;
   14663       break;
   14664    }
   14665 
   14666    /* ------------------------ (Grp3 extensions) ---------- */
   14667 
   14668    case 0xF6: { /* Grp3 Eb */
   14669       Bool decode_OK = True;
   14670       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14671       if (!decode_OK)
   14672          goto decode_failure;
   14673       break;
   14674    }
   14675    case 0xF7: { /* Grp3 Ev */
   14676       Bool decode_OK = True;
   14677       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14678       if (!decode_OK)
   14679          goto decode_failure;
   14680       break;
   14681    }
   14682 
   14683    /* ------------------------ (Grp4 extensions) ---------- */
   14684 
   14685    case 0xFE: { /* Grp4 Eb */
   14686       Bool decode_OK = True;
   14687       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14688       if (!decode_OK)
   14689          goto decode_failure;
   14690       break;
   14691    }
   14692 
   14693    /* ------------------------ (Grp5 extensions) ---------- */
   14694 
   14695    case 0xFF: { /* Grp5 Ev */
   14696       Bool decode_OK = True;
   14697       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14698       if (!decode_OK)
   14699          goto decode_failure;
   14700       break;
   14701    }
   14702 
   14703    /* ------------------------ Escapes to 2-byte opcodes -- */
   14704 
   14705    case 0x0F: {
   14706       opc = getIByte(delta); delta++;
   14707       switch (opc) {
   14708 
   14709       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14710 
   14711       case 0xBA: { /* Grp8 Ib,Ev */
   14712          Bool decode_OK = False;
   14713          modrm = getUChar(delta);
   14714          am_sz = lengthAMode(delta);
   14715          d32   = getSDisp8(delta + am_sz);
   14716          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14717                                 am_sz, sz, d32, &decode_OK );
   14718          if (!decode_OK)
   14719             goto decode_failure;
   14720          break;
   14721       }
   14722 
   14723       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14724 
   14725       case 0xBC: /* BSF Gv,Ev */
   14726          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14727          break;
   14728       case 0xBD: /* BSR Gv,Ev */
   14729          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14730          break;
   14731 
   14732       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14733 
   14734       case 0xC8: /* BSWAP %eax */
   14735       case 0xC9:
   14736       case 0xCA:
   14737       case 0xCB:
   14738       case 0xCC:
   14739       case 0xCD:
   14740       case 0xCE:
   14741       case 0xCF: /* BSWAP %edi */
   14742          /* AFAICS from the Intel docs, this only exists at size 4. */
   14743          if (sz != 4) goto decode_failure;
   14744 
   14745          t1 = newTemp(Ity_I32);
   14746          assign( t1, getIReg(4, opc-0xC8) );
   14747          t2 = math_BSWAP(t1, Ity_I32);
   14748 
   14749          putIReg(4, opc-0xC8, mkexpr(t2));
   14750          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14751          break;
   14752 
   14753       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14754 
   14755       case 0xA3: /* BT Gv,Ev */
   14756          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14757          break;
   14758       case 0xB3: /* BTR Gv,Ev */
   14759          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14760          break;
   14761       case 0xAB: /* BTS Gv,Ev */
   14762          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14763          break;
   14764       case 0xBB: /* BTC Gv,Ev */
   14765          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14766          break;
   14767 
   14768       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14769 
   14770       case 0x40:
   14771       case 0x41:
   14772       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14773       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14774       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14775       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14776       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14777       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14778       case 0x48: /* CMOVSb (cmov negative) */
   14779       case 0x49: /* CMOVNSb (cmov not negative) */
   14780       case 0x4A: /* CMOVP (cmov parity even) */
   14781       case 0x4B: /* CMOVNP (cmov parity odd) */
   14782       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14783       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14784       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14785       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14786          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14787          break;
   14788 
   14789       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14790 
   14791       case 0xB0: /* CMPXCHG Gb,Eb */
   14792          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14793          break;
   14794       case 0xB1: /* CMPXCHG Gv,Ev */
   14795          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14796          break;
   14797 
   14798       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14799          IRTemp expdHi    = newTemp(Ity_I32);
   14800          IRTemp expdLo    = newTemp(Ity_I32);
   14801          IRTemp dataHi    = newTemp(Ity_I32);
   14802          IRTemp dataLo    = newTemp(Ity_I32);
   14803          IRTemp oldHi     = newTemp(Ity_I32);
   14804          IRTemp oldLo     = newTemp(Ity_I32);
   14805          IRTemp flags_old = newTemp(Ity_I32);
   14806          IRTemp flags_new = newTemp(Ity_I32);
   14807          IRTemp success   = newTemp(Ity_I1);
   14808 
   14809          /* Translate this using a DCAS, even if there is no LOCK
   14810             prefix.  Life is too short to bother with generating two
   14811             different translations for the with/without-LOCK-prefix
   14812             cases. */
   14813          *expect_CAS = True;
   14814 
   14815 	 /* Decode, and generate address. */
   14816          if (sz != 4) goto decode_failure;
   14817          modrm = getIByte(delta);
   14818          if (epartIsReg(modrm)) goto decode_failure;
   14819          if (gregOfRM(modrm) != 1) goto decode_failure;
   14820          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14821          delta += alen;
   14822 
   14823          /* Get the expected and new values. */
   14824          assign( expdHi, getIReg(4,R_EDX) );
   14825          assign( expdLo, getIReg(4,R_EAX) );
   14826          assign( dataHi, getIReg(4,R_ECX) );
   14827          assign( dataLo, getIReg(4,R_EBX) );
   14828 
   14829          /* Do the DCAS */
   14830          stmt( IRStmt_CAS(
   14831                   mkIRCAS( oldHi, oldLo,
   14832                            Iend_LE, mkexpr(addr),
   14833                            mkexpr(expdHi), mkexpr(expdLo),
   14834                            mkexpr(dataHi), mkexpr(dataLo)
   14835                )));
   14836 
   14837          /* success when oldHi:oldLo == expdHi:expdLo */
   14838          assign( success,
   14839                  binop(Iop_CasCmpEQ32,
   14840                        binop(Iop_Or32,
   14841                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14842                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14843                        ),
   14844                        mkU32(0)
   14845                  ));
   14846 
   14847          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14848             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14849             which is where they came from originally.  Both the actual
   14850             contents of these two regs, and any shadow values, are
   14851             unchanged.  If the DCAS fails then we're putting into
   14852             EDX:EAX the value seen in memory. */
   14853          putIReg(4, R_EDX,
   14854                     IRExpr_ITE( mkexpr(success),
   14855                                 mkexpr(expdHi), mkexpr(oldHi)
   14856                 ));
   14857          putIReg(4, R_EAX,
   14858                     IRExpr_ITE( mkexpr(success),
   14859                                 mkexpr(expdLo), mkexpr(oldLo)
   14860                 ));
   14861 
   14862          /* Copy the success bit into the Z flag and leave the others
   14863             unchanged */
   14864          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14865          assign(
   14866             flags_new,
   14867             binop(Iop_Or32,
   14868                   binop(Iop_And32, mkexpr(flags_old),
   14869                                    mkU32(~X86G_CC_MASK_Z)),
   14870                   binop(Iop_Shl32,
   14871                         binop(Iop_And32,
   14872                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14873                         mkU8(X86G_CC_SHIFT_Z)) ));
   14874 
   14875          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14876          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14877          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14878          /* Set NDEP even though it isn't used.  This makes
   14879             redundant-PUT elimination of previous stores to this field
   14880             work better. */
   14881          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14882 
   14883          /* Sheesh.  Aren't you glad it was me and not you that had to
   14884 	    write and validate all this grunge? */
   14885 
   14886 	 DIP("cmpxchg8b %s\n", dis_buf);
   14887 	 break;
   14888       }
   14889 
   14890       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14891 
   14892       case 0xA2: { /* CPUID */
   14893          /* Uses dirty helper:
   14894                void x86g_dirtyhelper_CPUID_{sse0,mmxext,sse1,sse2,sse3} ( VexGuestX86State* )
   14895             (selected by archinfo->hwcaps), declared to mod eax, ecx; wr ebx, edx
   14896          */
   14897          IRDirty* d     = NULL;
   14898          void*    fAddr = NULL;
   14899          const HChar* fName = NULL;
   14900          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3) {
   14901             fName = "x86g_dirtyhelper_CPUID_sse3";
   14902             fAddr = &x86g_dirtyhelper_CPUID_sse3;
   14903          }
   14904          else
   14905          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14906             fName = "x86g_dirtyhelper_CPUID_sse2";
   14907             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14908          }
   14909          else
   14910          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14911             fName = "x86g_dirtyhelper_CPUID_sse1";
   14912             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14913          }
   14914          else
   14915          if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
   14916             fName = "x86g_dirtyhelper_CPUID_mmxext";
   14917             fAddr = &x86g_dirtyhelper_CPUID_mmxext;
   14918          }
   14919          else
   14920          if (archinfo->hwcaps == 0/*no SSE*/) {
   14921             fName = "x86g_dirtyhelper_CPUID_sse0";
   14922             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14923          } else
   14924             vpanic("disInstr(x86)(cpuid)");
   14925 
   14926          vassert(fName); vassert(fAddr);
   14927          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14928                                  fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
   14929          /* declare guest state effects */
   14930          d->nFxState = 4;
   14931          vex_bzero(&d->fxState, sizeof(d->fxState));
   14932          d->fxState[0].fx     = Ifx_Modify;
   14933          d->fxState[0].offset = OFFB_EAX;
   14934          d->fxState[0].size   = 4;
   14935          d->fxState[1].fx     = Ifx_Write;
   14936          d->fxState[1].offset = OFFB_EBX;
   14937          d->fxState[1].size   = 4;
   14938          d->fxState[2].fx     = Ifx_Modify;
   14939          d->fxState[2].offset = OFFB_ECX;
   14940          d->fxState[2].size   = 4;
   14941          d->fxState[3].fx     = Ifx_Write;
   14942          d->fxState[3].offset = OFFB_EDX;
   14943          d->fxState[3].size   = 4;
   14944          /* execute the dirty call, side-effecting guest state */
   14945          stmt( IRStmt_Dirty(d) );
   14946          /* CPUID is a serialising insn.  So, just in case someone is
   14947             using it as a memory fence ... */
   14948          stmt( IRStmt_MBE(Imbe_Fence) );
   14949          DIP("cpuid\n");
   14950          break;
   14951       }
   14952 
   14953 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14954 //--             goto decode_failure;
   14955 //--
   14956 //--          t1 = newTemp(cb);
   14957 //--          t2 = newTemp(cb);
   14958 //--          t3 = newTemp(cb);
   14959 //--          t4 = newTemp(cb);
   14960 //--          uInstr0(cb, CALLM_S, 0);
   14961 //--
   14962 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14963 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14964 //--
   14965 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14966 //--          uLiteral(cb, 0);
   14967 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14968 //--
   14969 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14970 //--          uLiteral(cb, 0);
   14971 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14972 //--
   14973 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14974 //--          uLiteral(cb, 0);
   14975 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14976 //--
   14977 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14978 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14979 //--
   14980 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14981 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14982 //--
   14983 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14984 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14985 //--
   14986 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14987 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14988 //--
   14989 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14990 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14991 //--
   14992 //--          uInstr0(cb, CALLM_E, 0);
   14993 //--          DIP("cpuid\n");
   14994 //--          break;
   14995 //--
   14996       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14997 
   14998       case 0xB6: /* MOVZXb Eb,Gv */
   14999          if (sz != 2 && sz != 4)
   15000             goto decode_failure;
   15001          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   15002          break;
   15003 
   15004       case 0xB7: /* MOVZXw Ew,Gv */
   15005          if (sz != 4)
   15006             goto decode_failure;
   15007          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   15008          break;
   15009 
   15010       case 0xBE: /* MOVSXb Eb,Gv */
   15011          if (sz != 2 && sz != 4)
   15012             goto decode_failure;
   15013          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   15014          break;
   15015 
   15016       case 0xBF: /* MOVSXw Ew,Gv */
   15017          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   15018             goto decode_failure;
   15019          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   15020          break;
   15021 
   15022 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   15023 //--
   15024 //--       case 0xC3: /* MOVNTI Gv,Ev */
   15025 //--          vg_assert(sz == 4);
   15026 //--          modrm = getUChar(eip);
   15027 //--          vg_assert(!epartIsReg(modrm));
   15028 //--          t1 = newTemp(cb);
   15029 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   15030 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   15031 //--          t2 = LOW24(pair);
   15032 //--          eip += HI8(pair);
   15033 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   15034 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   15035 //--          break;
   15036 
   15037       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   15038 
   15039       case 0xAF: /* IMUL Ev, Gv */
   15040          delta = dis_mul_E_G ( sorb, sz, delta );
   15041          break;
   15042 
   15043       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   15044 
   15045       case 0x1F:
   15046          modrm = getUChar(delta);
   15047          if (epartIsReg(modrm)) goto decode_failure;
   15048          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15049          delta += alen;
   15050          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   15051          break;
   15052 
   15053       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   15054       case 0x80:
   15055       case 0x81:
   15056       case 0x82: /* JBb/JNAEb (jump below) */
   15057       case 0x83: /* JNBb/JAEb (jump not below) */
   15058       case 0x84: /* JZb/JEb (jump zero) */
   15059       case 0x85: /* JNZb/JNEb (jump not zero) */
   15060       case 0x86: /* JBEb/JNAb (jump below or equal) */
   15061       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   15062       case 0x88: /* JSb (jump negative) */
   15063       case 0x89: /* JNSb (jump not negative) */
   15064       case 0x8A: /* JP (jump parity even) */
   15065       case 0x8B: /* JNP/JPO (jump parity odd) */
   15066       case 0x8C: /* JLb/JNGEb (jump less) */
   15067       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   15068       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   15069       case 0x8F: /* JGb/JNLEb (jump greater) */
   15070        { Int    jmpDelta;
   15071          const HChar* comment  = "";
   15072          jmpDelta = (Int)getUDisp32(delta);
   15073          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   15074          delta += 4;
   15075          if (resteerCisOk
   15076              && vex_control.guest_chase_cond
   15077              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15078              && jmpDelta < 0
   15079              && resteerOkFn( callback_opaque, (Addr32)d32) ) {
   15080             /* Speculation: assume this backward branch is taken.  So
   15081                we need to emit a side-exit to the insn following this
   15082                one, on the negation of the condition, and continue at
   15083                the branch target address (d32).  If we wind up back at
   15084                the first instruction of the trace, just stop; it's
   15085                better to let the IR loop unroller handle that case.*/
   15086             stmt( IRStmt_Exit(
   15087                      mk_x86g_calculate_condition((X86Condcode)
   15088                                                  (1 ^ (opc - 0x80))),
   15089                      Ijk_Boring,
   15090                      IRConst_U32(guest_EIP_bbstart+delta),
   15091                      OFFB_EIP ) );
   15092             dres.whatNext   = Dis_ResteerC;
   15093             dres.continueAt = (Addr32)d32;
   15094             comment = "(assumed taken)";
   15095          }
   15096          else
   15097          if (resteerCisOk
   15098              && vex_control.guest_chase_cond
   15099              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15100              && jmpDelta >= 0
   15101              && resteerOkFn( callback_opaque,
   15102                              (Addr32)(guest_EIP_bbstart+delta)) ) {
   15103             /* Speculation: assume this forward branch is not taken.
   15104                So we need to emit a side-exit to d32 (the dest) and
   15105                continue disassembling at the insn immediately
   15106                following this one. */
   15107             stmt( IRStmt_Exit(
   15108                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   15109                      Ijk_Boring,
   15110                      IRConst_U32(d32),
   15111                      OFFB_EIP ) );
   15112             dres.whatNext   = Dis_ResteerC;
   15113             dres.continueAt = guest_EIP_bbstart + delta;
   15114             comment = "(assumed not taken)";
   15115          }
   15116          else {
   15117             /* Conservative default translation - end the block at
   15118                this point. */
   15119             jcc_01( &dres, (X86Condcode)(opc - 0x80),
   15120                     (Addr32)(guest_EIP_bbstart+delta), d32);
   15121             vassert(dres.whatNext == Dis_StopHere);
   15122          }
   15123          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   15124          break;
   15125        }
   15126 
   15127       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   15128       case 0x31: { /* RDTSC */
   15129          IRTemp   val  = newTemp(Ity_I64);
   15130          IRExpr** args = mkIRExprVec_0();
   15131          IRDirty* d    = unsafeIRDirty_1_N (
   15132                             val,
   15133                             0/*regparms*/,
   15134                             "x86g_dirtyhelper_RDTSC",
   15135                             &x86g_dirtyhelper_RDTSC,
   15136                             args
   15137                          );
   15138          /* execute the dirty call, dumping the result in val. */
   15139          stmt( IRStmt_Dirty(d) );
   15140          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   15141          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   15142          DIP("rdtsc\n");
   15143          break;
   15144       }
   15145 
   15146       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   15147 
   15148       case 0xA1: /* POP %FS */
   15149          dis_pop_segreg( R_FS, sz ); break;
   15150       case 0xA9: /* POP %GS */
   15151          dis_pop_segreg( R_GS, sz ); break;
   15152 
   15153       case 0xA0: /* PUSH %FS */
   15154          dis_push_segreg( R_FS, sz ); break;
   15155       case 0xA8: /* PUSH %GS */
   15156          dis_push_segreg( R_GS, sz ); break;
   15157 
   15158       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   15159       case 0x90:
   15160       case 0x91:
   15161       case 0x92: /* set-Bb/set-NAEb (jump below) */
   15162       case 0x93: /* set-NBb/set-AEb (jump not below) */
   15163       case 0x94: /* set-Zb/set-Eb (jump zero) */
   15164       case 0x95: /* set-NZb/set-NEb (jump not zero) */
   15165       case 0x96: /* set-BEb/set-NAb (jump below or equal) */
   15166       case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
   15167       case 0x98: /* set-Sb (jump negative) */
   15168       case 0x99: /* set-NSb (jump not negative) */
   15169       case 0x9A: /* set-P (jump parity even) */
   15170       case 0x9B: /* set-NP (jump parity odd) */
   15171       case 0x9C: /* set-Lb/set-NGEb (jump less) */
   15172       case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
   15173       case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
   15174       case 0x9F: /* set-Gb/set-NLEb (jump greater) */
   15175          t1 = newTemp(Ity_I8);
   15176          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   15177          modrm = getIByte(delta);
   15178          if (epartIsReg(modrm)) {
   15179             delta++;
   15180             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   15181             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   15182                               nameIReg(1,eregOfRM(modrm)));
   15183          } else {
   15184            addr = disAMode ( &alen, sorb, delta, dis_buf );
   15185            delta += alen;
   15186            storeLE( mkexpr(addr), mkexpr(t1) );
   15187            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   15188          }
   15189          break;
   15190 
   15191       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   15192 
   15193       case 0xA4: /* SHLDv imm8,Gv,Ev */
   15194          modrm = getIByte(delta);
   15195          d32   = delta + lengthAMode(delta);
   15196          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15197          delta = dis_SHLRD_Gv_Ev (
   15198                   sorb, delta, modrm, sz,
   15199                   mkU8(getIByte(d32)), True, /* literal */
   15200                   dis_buf, True );
   15201          break;
   15202       case 0xA5: /* SHLDv %cl,Gv,Ev */
   15203          modrm = getIByte(delta);
   15204          delta = dis_SHLRD_Gv_Ev (
   15205                     sorb, delta, modrm, sz,
   15206                     getIReg(1,R_ECX), False, /* not literal */
   15207                     "%cl", True );
   15208          break;
   15209 
   15210       case 0xAC: /* SHRDv imm8,Gv,Ev */
   15211          modrm = getIByte(delta);
   15212          d32   = delta + lengthAMode(delta);
   15213          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15214          delta = dis_SHLRD_Gv_Ev (
   15215                     sorb, delta, modrm, sz,
   15216                     mkU8(getIByte(d32)), True, /* literal */
   15217                     dis_buf, False );
   15218          break;
   15219       case 0xAD: /* SHRDv %cl,Gv,Ev */
   15220          modrm = getIByte(delta);
   15221          delta = dis_SHLRD_Gv_Ev (
   15222                     sorb, delta, modrm, sz,
   15223                     getIReg(1,R_ECX), False, /* not literal */
   15224                     "%cl", False );
   15225          break;
   15226 
   15227       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   15228 
   15229       case 0x34:
   15230          /* Simple implementation needing a long explanation.
   15231 
   15232             sysenter is a kind of syscall entry.  The key thing here
   15233             is that the return address is not known -- that is
   15234             something that is beyond Vex's knowledge.  So this IR
   15235             forces a return to the scheduler, which can do what it
   15236             likes to simulate the sysenter, but it MUST set this
   15237             thread's guest_EIP field with the continuation address
   15238             before resuming execution.  If that doesn't happen, the
   15239             thread will jump to address zero, which is probably
   15240             fatal.
   15241          */
   15242 
   15243          /* Note where we are, so we can back up the guest to this
   15244             point if the syscall needs to be restarted. */
   15245          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15246                            mkU32(guest_EIP_curr_instr) ) );
   15247          jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   15248          vassert(dres.whatNext == Dis_StopHere);
   15249          DIP("sysenter");
   15250          break;
   15251 
   15252       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   15253 
   15254       case 0xC0: { /* XADD Gb,Eb */
   15255          Bool decodeOK;
   15256          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   15257          if (!decodeOK) goto decode_failure;
   15258          break;
   15259       }
   15260       case 0xC1: { /* XADD Gv,Ev */
   15261          Bool decodeOK;
   15262          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   15263          if (!decodeOK) goto decode_failure;
   15264          break;
   15265       }
   15266 
   15267       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   15268 
   15269       case 0x71:
   15270       case 0x72:
   15271       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   15272 
   15273       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   15274       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   15275       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   15276       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   15277 
   15278       case 0xFC:
   15279       case 0xFD:
   15280       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   15281 
   15282       case 0xEC:
   15283       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15284 
   15285       case 0xDC:
   15286       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15287 
   15288       case 0xF8:
   15289       case 0xF9:
   15290       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   15291 
   15292       case 0xE8:
   15293       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15294 
   15295       case 0xD8:
   15296       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15297 
   15298       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   15299       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   15300 
   15301       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   15302 
   15303       case 0x74:
   15304       case 0x75:
   15305       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   15306 
   15307       case 0x64:
   15308       case 0x65:
   15309       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   15310 
   15311       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   15312       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15313       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15314 
   15315       case 0x68:
   15316       case 0x69:
   15317       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   15318 
   15319       case 0x60:
   15320       case 0x61:
   15321       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15322 
   15323       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   15324       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   15325       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   15326       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   15327 
   15328       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15329       case 0xF2:
   15330       case 0xF3:
   15331 
   15332       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15333       case 0xD2:
   15334       case 0xD3:
   15335 
   15336       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   15337       case 0xE2:
   15338       {
   15339          Int  delta0    = delta-1;
   15340          Bool decode_OK = False;
   15341 
   15342          /* If sz==2 this is SSE, and we assume sse idec has
   15343             already spotted those cases by now. */
   15344          if (sz != 4)
   15345             goto decode_failure;
   15346 
   15347          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   15348          if (!decode_OK) {
   15349             delta = delta0;
   15350             goto decode_failure;
   15351          }
   15352          break;
   15353       }
   15354 
   15355       case 0x0E: /* FEMMS */
   15356       case 0x77: /* EMMS */
   15357          if (sz != 4)
   15358             goto decode_failure;
   15359          do_EMMS_preamble();
   15360          DIP("{f}emms\n");
   15361          break;
   15362 
   15363       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15364       case 0x01: /* 0F 01 /0 -- SGDT */
   15365                  /* 0F 01 /1 -- SIDT */
   15366       {
   15367           /* This is really revolting, but ... since each processor
   15368              (core) only has one IDT and one GDT, just let the guest
   15369              see it (pass-through semantics).  I can't see any way to
   15370              construct a faked-up value, so don't bother to try. */
   15371          modrm = getUChar(delta);
   15372          if (epartIsReg(modrm)) goto decode_failure;
   15373          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15374             goto decode_failure;
   15375          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15376          delta += alen;
   15377          switch (gregOfRM(modrm)) {
   15378             case 0: DIP("sgdt %s\n", dis_buf); break;
   15379             case 1: DIP("sidt %s\n", dis_buf); break;
   15380             default: vassert(0); /*NOTREACHED*/
   15381          }
   15382 
   15383          IRDirty* d = unsafeIRDirty_0_N (
   15384                           0/*regparms*/,
   15385                           "x86g_dirtyhelper_SxDT",
   15386                           &x86g_dirtyhelper_SxDT,
   15387                           mkIRExprVec_2( mkexpr(addr),
   15388                                          mkU32(gregOfRM(modrm)) )
   15389                       );
   15390          /* declare we're writing memory */
   15391          d->mFx   = Ifx_Write;
   15392          d->mAddr = mkexpr(addr);
   15393          d->mSize = 6;
   15394          stmt( IRStmt_Dirty(d) );
   15395          break;
   15396       }
   15397 
   15398       case 0x05: /* AMD's syscall */
   15399          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15400                            mkU32(guest_EIP_curr_instr) ) );
   15401          jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
   15402          vassert(dres.whatNext == Dis_StopHere);
   15403          DIP("syscall\n");
   15404          break;
   15405 
   15406       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15407 
   15408       default:
   15409          goto decode_failure;
   15410    } /* switch (opc) for the 2-byte opcodes */
   15411    goto decode_success;
   15412    } /* case 0x0F: of primary opcode */
   15413 
   15414    /* ------------------------ ??? ------------------------ */
   15415 
   15416   default:
   15417   decode_failure:
   15418    /* All decode failures end up here. */
   15419    if (sigill_diag) {
   15420       vex_printf("vex x86->IR: unhandled instruction bytes: "
   15421                  "0x%x 0x%x 0x%x 0x%x\n",
   15422                  getIByte(delta_start+0),
   15423                  getIByte(delta_start+1),
   15424                  getIByte(delta_start+2),
   15425                  getIByte(delta_start+3));
   15426    }
   15427 
   15428    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15429       not been executed, and (is currently) the next to be executed.
   15430       EIP should be up-to-date since it is made so at the start of each
   15431       insn, but nevertheless be paranoid and update it again right
   15432       now. */
   15433    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15434    jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
   15435    vassert(dres.whatNext == Dis_StopHere);
   15436    dres.len = 0;
   15437    /* We also need to say that a CAS is not expected now, regardless
   15438       of what it might have been set to at the start of the function,
   15439       since the IR that we've emitted just above (to synthesise a
   15440       SIGILL) does not involve any CAS, and presumably no other IR has
   15441       been emitted for this (non-decoded) insn. */
   15442    *expect_CAS = False;
   15443    return dres;
   15444 
   15445    } /* switch (opc) for the main (primary) opcode switch. */
   15446 
   15447   decode_success:
   15448    /* All decode successes end up here. */
   15449    switch (dres.whatNext) {
   15450       case Dis_Continue:
   15451          stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   15452          break;
   15453       case Dis_ResteerU:
   15454       case Dis_ResteerC:
   15455          stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
   15456          break;
   15457       case Dis_StopHere:
   15458          break;
   15459       default:
   15460          vassert(0);
   15461    }
   15462 
   15463    DIP("\n");
   15464    dres.len = delta - delta_start;
   15465    return dres;
   15466 }
   15467 
   /* DIP is used throughout the decoder above with printf-style
      arguments (e.g. DIP("rdtsc\n")); DIS presumably belongs to the
      same family of helper macros, but its definition and uses lie
      outside this part of the file -- TODO confirm.  Both names are
      undefined here so that they are no longer visible to the code
      that follows. */
   15468 #undef DIP
   15469 #undef DIS
   15470 
   15471 
   15472 /*------------------------------------------------------------*/
   15473 /*--- Top-level fn                                         ---*/
   15474 /*------------------------------------------------------------*/
   15475 
   15476 /* Disassemble a single instruction into IR.  The instruction
   15477    is located in host memory at &guest_code[delta]. */
   15478 
   15479 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15480                          Bool         (*resteerOkFn) ( void*, Addr ),
   15481                          Bool         resteerCisOk,
   15482                          void*        callback_opaque,
   15483                          const UChar* guest_code_IN,
   15484                          Long         delta,
   15485                          Addr         guest_IP,
   15486                          VexArch      guest_arch,
   15487                          const VexArchInfo* archinfo,
   15488                          const VexAbiInfo*  abiinfo,
   15489                          VexEndness   host_endness_IN,
   15490                          Bool         sigill_diag_IN )
   15491 {
   15492    Int       i, x1, x2;
   15493    Bool      expect_CAS, has_CAS;
   15494    DisResult dres;
   15495 
   15496    /* Set globals (see top of this file) */
   15497    vassert(guest_arch == VexArchX86);
   15498    guest_code           = guest_code_IN;
   15499    irsb                 = irsb_IN;
   15500    host_endness         = host_endness_IN;
   15501    guest_EIP_curr_instr = (Addr32)guest_IP;
   15502    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15503 
   15504    x1 = irsb_IN->stmts_used;
   15505    expect_CAS = False;
   15506    dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15507                              resteerCisOk,
   15508                              callback_opaque,
   15509                              delta, archinfo, abiinfo, sigill_diag_IN );
   15510    x2 = irsb_IN->stmts_used;
   15511    vassert(x2 >= x1);
   15512 
   15513    /* See comment at the top of disInstr_X86_WRK for meaning of
   15514       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15515       IRCAS as directed by the returned expect_CAS value. */
   15516    has_CAS = False;
   15517    for (i = x1; i < x2; i++) {
   15518       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15519          has_CAS = True;
   15520    }
   15521 
   15522    if (expect_CAS != has_CAS) {
   15523       /* inconsistency detected.  re-disassemble the instruction so as
   15524          to generate a useful error message; then assert. */
   15525       vex_traceflags |= VEX_TRACE_FE;
   15526       dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15527                                 resteerCisOk,
   15528                                 callback_opaque,
   15529                                 delta, archinfo, abiinfo, sigill_diag_IN );
   15530       for (i = x1; i < x2; i++) {
   15531          vex_printf("\t\t");
   15532          ppIRStmt(irsb_IN->stmts[i]);
   15533          vex_printf("\n");
   15534       }
   15535       /* Failure of this assertion is serious and denotes a bug in
   15536          disInstr. */
   15537       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15538    }
   15539 
   15540    return dres;
   15541 }
   15542 
   15543 
   15544 /*--------------------------------------------------------------------*/
   15545 /*--- end                                         guest_x86_toIR.c ---*/
   15546 /*--------------------------------------------------------------------*/
   15547