Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * some of the FCOM cases could do with testing -- not convinced
     58      that the args are the right way round.
     59 
     60    * FSAVE does not re-initialise the FPU; it should do
     61 
     62    * FINIT not only initialises the FPU environment, it also
     63      zeroes all the FP registers.  It should leave the registers
     64      unchanged.
     65 
     66    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     67    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     68    only way to observe eflags[1], a proper fix would be to make that
     69    bit be set by PUSHF.
     70 
     71    The state of %eflags.AC (alignment check, bit 18) is recorded by
     72    the simulation (viz, if you set it with popf then a pushf produces
     73    the value you set it to), but it is otherwise ignored.  In
     74    particular, setting it to 1 does NOT cause alignment checking to
     75    happen.  Programs that set it to 1 and then rely on the resulting
     76    SIGBUSs to inform them of misaligned accesses will not work.
     77 
     78    Implementation of sysenter is necessarily partial.  sysenter is a
     79    kind of system call entry.  When doing a sysenter, the return
     80    address is not known -- that is something that is beyond Vex's
     81    knowledge.  So the generated IR forces a return to the scheduler,
     82    which can do what it likes to simulate the sysenter, but it MUST
     83    set this thread's guest_EIP field with the continuation address
     84    before resuming execution.  If that doesn't happen, the thread will
     85    jump to address zero, which is probably fatal.
     86 
     87    This module uses global variables and so is not MT-safe (if that
     88    should ever become relevant).
     89 
     90    The delta values are 32-bit ints, not 64-bit ints.  That means
     91    this module may not work right if run on a 64-bit host.  That should
     92    be fixed properly, really -- if anyone ever wants to use Vex to
     93    translate x86 code for execution on a 64-bit host.
     94 
     95    casLE (implementation of lock-prefixed insns) and rep-prefixed
     96    insns: the side-exit back to the start of the insn is done with
     97    Ijk_Boring.  This is quite wrong, it should be done with
     98    Ijk_NoRedir, since otherwise the side exit, which is intended to
     99    restart the instruction for whatever reason, could go somewhere
    100    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    101    no-redir jumps performance critical, at least for rep-prefixed
    102    instructions, since all iterations thereof would involve such a
    103    jump.  It's not such a big deal with casLE since the side exit is
    104    only taken if the CAS fails, that is, the location is contended,
    105    which is relatively unlikely.
    106 
    107    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    108    problem.
    109 
    110    Note also, the test for CAS success vs failure is done using
    111    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    112    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    113    shouldn't definedness-check these comparisons.  See
    114    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    115    background/rationale.
    116 */
    117 
    118 /* Performance holes:
    119 
    120    - fcom ; fstsw %ax ; sahf
    121      sahf does not update the O flag (sigh) and so O needs to
    122      be computed.  This is done expensively; it would be better
    123      to have a calculate_eflags_o helper.
    124 
    125    - emwarns; some FP codes can generate huge numbers of these
    126      if the fpucw is changed in an inner loop.  It would be
    127      better for the guest state to have an emwarn-enable reg
    128      which can be set zero or nonzero.  If it is zero, emwarns
    129      are not flagged, and instead control just flows all the
    130      way through bbs as usual.
    131 */
    132 
    133 /* "Special" instructions.
    134 
    135    This instruction decoder can decode four special instructions
    136    which mean nothing natively (are no-ops as far as regs/mem are
    137    concerned) but have meaning for supporting Valgrind.  A special
    138    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    139    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    140    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    141    one of the following 4 is allowed (standard interpretation in
    142    parentheses):
    143 
    144       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    145       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    146       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    147       87FF (xchgl %edi,%edi)   IR injection
    148 
    149    Any other bytes following the 12-byte preamble are illegal and
    150    constitute a failure in instruction decoding.  This all assumes
    151    that the preamble will never occur except in specific code
    152    fragments designed for Valgrind to catch.
    153 
    154    No prefixes may precede a "Special" instruction.
    155 */
    156 
    157 /* LOCK prefixed instructions.  These are translated using IR-level
    158    CAS statements (IRCAS) and are believed to preserve atomicity, even
    159    from the point of view of some other process racing against a
    160    simulated one (presumably they communicate via a shared memory
    161    segment).
    162 
    163    Handlers which are aware of LOCK prefixes are:
    164       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    165       dis_cmpxchg_G_E  (cmpxchg)
    166       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    167       dis_Grp3         (not, neg)
    168       dis_Grp4         (inc, dec)
    169       dis_Grp5         (inc, dec)
    170       dis_Grp8_Imm     (bts, btc, btr)
    171       dis_bt_G_E       (bts, btc, btr)
    172       dis_xadd_G_E     (xadd)
    173 */
    174 
    175 
    176 #include "libvex_basictypes.h"
    177 #include "libvex_ir.h"
    178 #include "libvex.h"
    179 #include "libvex_guest_x86.h"
    180 
    181 #include "main_util.h"
    182 #include "main_globals.h"
    183 #include "guest_generic_bb_to_IR.h"
    184 #include "guest_generic_x87.h"
    185 #include "guest_x86_defs.h"
    186 
    187 
    188 /*------------------------------------------------------------*/
    189 /*--- Globals                                              ---*/
    190 /*------------------------------------------------------------*/
    191 
    192 /* These are set at the start of the translation of an insn, right
    193    down in disInstr_X86, so that we don't have to pass them around
    194    endlessly.  They are all constant during the translation of any
    195    given insn. */
    196 
/* We need to know this to do sub-register accesses correctly.  The
   sub-register and XMM-lane offset helpers in this file assert that
   it is VexEndnessLE. */
static VexEndness host_endness;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed).  The instruction-stream fetch helpers index
   it with a 'delta' offset. */
static const UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr32 guest_EIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr32 guest_EIP_curr_instr;

/* The IRSB* into which we're generating code.  stmt() appends
   statements to it and newTemp() allocates temporaries in its type
   environment. */
static IRSB* irsb;
    213 
    214 
    215 /*------------------------------------------------------------*/
    216 /*--- Debugging output                                     ---*/
    217 /*------------------------------------------------------------*/
    218 
    219 #define DIP(format, args...)           \
    220    if (vex_traceflags & VEX_TRACE_FE)  \
    221       vex_printf(format, ## args)
    222 
    223 #define DIS(buf, format, args...)      \
    224    if (vex_traceflags & VEX_TRACE_FE)  \
    225       vex_sprintf(buf, format, ## args)
    226 
    227 
    228 /*------------------------------------------------------------*/
    229 /*--- Offsets of various parts of the x86 guest state.     ---*/
    230 /*------------------------------------------------------------*/
    231 
/* Each OFFB_xxx is the byte offset, obtained with offsetof, of the
   field guest_xxx inside VexGuestX86State.  These are the offsets
   passed to IRExpr_Get / IRStmt_Put by the helpers further down. */

/* Integer registers. */
#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)

#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)

/* The four fields of the flags thunk (CC_OP/CC_DEP1/CC_DEP2/CC_NDEP). */
#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)

/* OFFB_FPREGS and OFFB_FPTAGS are the offsets of element 0 of the
   guest_FPREG and guest_FPTAG arrays respectively. */
#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)

/* Segment registers (read and written as Ity_I16 by getSReg/putSReg),
   plus the guest_LDT and guest_GDT fields. */
#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)

/* guest_SSEROUND and the eight XMM registers (accessed as Ity_V128,
   or lane-wise, by the XMM helpers below). */
#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)

#define OFFB_EMNOTE    offsetof(VexGuestX86State,guest_EMNOTE)

#define OFFB_CMSTART   offsetof(VexGuestX86State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestX86State,guest_CMLEN)
#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)

#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    283 
    284 
    285 /*------------------------------------------------------------*/
    286 /*--- Helper bits and pieces for deconstructing the        ---*/
    287 /*--- x86 insn stream.                                     ---*/
    288 /*------------------------------------------------------------*/
    289 
/* This is the Intel register encoding -- integer regs. */
#define R_EAX 0
#define R_ECX 1
#define R_EDX 2
#define R_EBX 3
#define R_ESP 4
#define R_EBP 5
#define R_ESI 6
#define R_EDI 7

/* Byte-register numbers.  For a 1-byte access, integerGuestRegOffset
   maps register numbers 0..3 to the lowest byte of registers 0..3,
   and register numbers 4..7 to the byte at offset 1 of registers
   0..3; hence R_AH is R_EAX plus 4. */
#define R_AL (0+R_EAX)
#define R_AH (4+R_EAX)

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5
    310 
    311 
    312 /* Add a statement to the list held by "irbb". */
    313 static void stmt ( IRStmt* st )
    314 {
    315    addStmtToIRSB( irsb, st );
    316 }
    317 
    318 /* Generate a new temporary of the given type. */
    319 static IRTemp newTemp ( IRType ty )
    320 {
    321    vassert(isPlausibleIRType(ty));
    322    return newIRTemp( irsb->tyenv, ty );
    323 }
    324 
    325 /* Various simple conversions */
    326 
    327 static UInt extend_s_8to32( UInt x )
    328 {
    329    return (UInt)((Int)(x << 24) >> 24);
    330 }
    331 
    332 static UInt extend_s_16to32 ( UInt x )
    333 {
    334   return (UInt)((Int)(x << 16) >> 16);
    335 }
    336 
    337 /* Fetch a byte from the guest insn stream. */
    338 static UChar getIByte ( Int delta )
    339 {
    340    return guest_code[delta];
    341 }
    342 
    343 /* Extract the reg field from a modRM byte. */
    344 static Int gregOfRM ( UChar mod_reg_rm )
    345 {
    346    return (Int)( (mod_reg_rm >> 3) & 7 );
    347 }
    348 
    349 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    350    register or memory.  If so, the byte will have the form 11XXXYYY,
    351    where YYY is the register number. */
    352 static Bool epartIsReg ( UChar mod_reg_rm )
    353 {
    354    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    355 }
    356 
    357 /* ... and extract the register number ... */
    358 static Int eregOfRM ( UChar mod_reg_rm )
    359 {
    360    return (Int)(mod_reg_rm & 0x7);
    361 }
    362 
    363 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    364 
    365 static UChar getUChar ( Int delta )
    366 {
    367    UChar v = guest_code[delta+0];
    368    return toUChar(v);
    369 }
    370 
    371 static UInt getUDisp16 ( Int delta )
    372 {
    373    UInt v = guest_code[delta+1]; v <<= 8;
    374    v |= guest_code[delta+0];
    375    return v & 0xFFFF;
    376 }
    377 
    378 static UInt getUDisp32 ( Int delta )
    379 {
    380    UInt v = guest_code[delta+3]; v <<= 8;
    381    v |= guest_code[delta+2]; v <<= 8;
    382    v |= guest_code[delta+1]; v <<= 8;
    383    v |= guest_code[delta+0];
    384    return v;
    385 }
    386 
    387 static UInt getUDisp ( Int size, Int delta )
    388 {
    389    switch (size) {
    390       case 4: return getUDisp32(delta);
    391       case 2: return getUDisp16(delta);
    392       case 1: return (UInt)getUChar(delta);
    393       default: vpanic("getUDisp(x86)");
    394    }
    395    return 0; /*notreached*/
    396 }
    397 
    398 
    399 /* Get a byte value out of the insn stream and sign-extend to 32
    400    bits. */
    401 static UInt getSDisp8 ( Int delta )
    402 {
    403    return extend_s_8to32( (UInt) (guest_code[delta]) );
    404 }
    405 
    406 static UInt getSDisp16 ( Int delta0 )
    407 {
    408    const UChar* eip = &guest_code[delta0];
    409    UInt d = *eip++;
    410    d |= ((*eip++) << 8);
    411    return extend_s_16to32(d);
    412 }
    413 
    414 static UInt getSDisp ( Int size, Int delta )
    415 {
    416    switch (size) {
    417       case 4: return getUDisp32(delta);
    418       case 2: return getSDisp16(delta);
    419       case 1: return getSDisp8(delta);
    420       default: vpanic("getSDisp(x86)");
    421   }
    422   return 0; /*notreached*/
    423 }
    424 
    425 
    426 /*------------------------------------------------------------*/
    427 /*--- Helpers for constructing IR.                         ---*/
    428 /*------------------------------------------------------------*/
    429 
    430 /* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
    431    register references, we need to take the host endianness into
    432    account.  Supplied value is 0 .. 7 and in the Intel instruction
    433    encoding. */
    434 
    435 static IRType szToITy ( Int n )
    436 {
    437    switch (n) {
    438       case 1: return Ity_I8;
    439       case 2: return Ity_I16;
    440       case 4: return Ity_I32;
    441       default: vpanic("szToITy(x86)");
    442    }
    443 }
    444 
    445 /* On a little-endian host, less significant bits of the guest
    446    registers are at lower addresses.  Therefore, if a reference to a
    447    register low half has the safe guest state offset as a reference to
    448    the full register.
    449 */
    450 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    451 {
    452    vassert(archreg < 8);
    453 
    454    /* Correct for little-endian host only. */
    455    vassert(host_endness == VexEndnessLE);
    456 
    457    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    458       switch (archreg) {
    459          case R_EAX: return OFFB_EAX;
    460          case R_EBX: return OFFB_EBX;
    461          case R_ECX: return OFFB_ECX;
    462          case R_EDX: return OFFB_EDX;
    463          case R_ESI: return OFFB_ESI;
    464          case R_EDI: return OFFB_EDI;
    465          case R_ESP: return OFFB_ESP;
    466          case R_EBP: return OFFB_EBP;
    467          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    468       }
    469    }
    470 
    471    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    472    switch (archreg-4) {
    473       case R_EAX: return 1+ OFFB_EAX;
    474       case R_EBX: return 1+ OFFB_EBX;
    475       case R_ECX: return 1+ OFFB_ECX;
    476       case R_EDX: return 1+ OFFB_EDX;
    477       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    478    }
    479 
    480    /* NOTREACHED */
    481    vpanic("integerGuestRegOffset(x86,le)");
    482 }
    483 
    484 static Int segmentGuestRegOffset ( UInt sreg )
    485 {
    486    switch (sreg) {
    487       case R_ES: return OFFB_ES;
    488       case R_CS: return OFFB_CS;
    489       case R_SS: return OFFB_SS;
    490       case R_DS: return OFFB_DS;
    491       case R_FS: return OFFB_FS;
    492       case R_GS: return OFFB_GS;
    493       default: vpanic("segmentGuestRegOffset(x86)");
    494    }
    495 }
    496 
    497 static Int xmmGuestRegOffset ( UInt xmmreg )
    498 {
    499    switch (xmmreg) {
    500       case 0: return OFFB_XMM0;
    501       case 1: return OFFB_XMM1;
    502       case 2: return OFFB_XMM2;
    503       case 3: return OFFB_XMM3;
    504       case 4: return OFFB_XMM4;
    505       case 5: return OFFB_XMM5;
    506       case 6: return OFFB_XMM6;
    507       case 7: return OFFB_XMM7;
    508       default: vpanic("xmmGuestRegOffset");
    509    }
    510 }
    511 
    512 /* Lanes of vector registers are always numbered from zero being the
    513    least significant lane (rightmost in the register).  */
    514 
    515 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    516 {
    517    /* Correct for little-endian host only. */
    518    vassert(host_endness == VexEndnessLE);
    519    vassert(laneno >= 0 && laneno < 8);
    520    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    521 }
    522 
    523 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    524 {
    525    /* Correct for little-endian host only. */
    526    vassert(host_endness == VexEndnessLE);
    527    vassert(laneno >= 0 && laneno < 4);
    528    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    529 }
    530 
    531 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    532 {
    533    /* Correct for little-endian host only. */
    534    vassert(host_endness == VexEndnessLE);
    535    vassert(laneno >= 0 && laneno < 2);
    536    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    537 }
    538 
    539 static IRExpr* getIReg ( Int sz, UInt archreg )
    540 {
    541    vassert(sz == 1 || sz == 2 || sz == 4);
    542    vassert(archreg < 8);
    543    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    544                       szToITy(sz) );
    545 }
    546 
    547 /* Ditto, but write to a reg instead. */
    548 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    549 {
    550    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    551    switch (sz) {
    552       case 1: vassert(ty == Ity_I8); break;
    553       case 2: vassert(ty == Ity_I16); break;
    554       case 4: vassert(ty == Ity_I32); break;
    555       default: vpanic("putIReg(x86)");
    556    }
    557    vassert(archreg < 8);
    558    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    559 }
    560 
    561 static IRExpr* getSReg ( UInt sreg )
    562 {
    563    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    564 }
    565 
    566 static void putSReg ( UInt sreg, IRExpr* e )
    567 {
    568    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    569    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    570 }
    571 
    572 static IRExpr* getXMMReg ( UInt xmmreg )
    573 {
    574    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    575 }
    576 
    577 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    578 {
    579    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    580 }
    581 
    582 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    583 {
    584    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    585 }
    586 
    587 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    588 {
    589    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    590 }
    591 
    592 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    593 {
    594    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    595 }
    596 
    597 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    598 {
    599    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    600    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    601 }
    602 
    603 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    604 {
    605    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    606    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    607 }
    608 
    609 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    610 {
    611    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    612    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    613 }
    614 
    615 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    616 {
    617    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    618    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    619 }
    620 
    621 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    622 {
    623    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    624    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    625 }
    626 
    627 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    628 {
    629    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    630    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    631 }
    632 
    633 static void assign ( IRTemp dst, IRExpr* e )
    634 {
    635    stmt( IRStmt_WrTmp(dst, e) );
    636 }
    637 
    638 static void storeLE ( IRExpr* addr, IRExpr* data )
    639 {
    640    stmt( IRStmt_Store(Iend_LE, addr, data) );
    641 }
    642 
    643 static IRExpr* unop ( IROp op, IRExpr* a )
    644 {
    645    return IRExpr_Unop(op, a);
    646 }
    647 
    648 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    649 {
    650    return IRExpr_Binop(op, a1, a2);
    651 }
    652 
    653 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    654 {
    655    return IRExpr_Triop(op, a1, a2, a3);
    656 }
    657 
    658 static IRExpr* mkexpr ( IRTemp tmp )
    659 {
    660    return IRExpr_RdTmp(tmp);
    661 }
    662 
    663 static IRExpr* mkU8 ( UInt i )
    664 {
    665    vassert(i < 256);
    666    return IRExpr_Const(IRConst_U8( (UChar)i ));
    667 }
    668 
    669 static IRExpr* mkU16 ( UInt i )
    670 {
    671    vassert(i < 65536);
    672    return IRExpr_Const(IRConst_U16( (UShort)i ));
    673 }
    674 
    675 static IRExpr* mkU32 ( UInt i )
    676 {
    677    return IRExpr_Const(IRConst_U32(i));
    678 }
    679 
    680 static IRExpr* mkU64 ( ULong i )
    681 {
    682    return IRExpr_Const(IRConst_U64(i));
    683 }
    684 
    685 static IRExpr* mkU ( IRType ty, UInt i )
    686 {
    687    if (ty == Ity_I8)  return mkU8(i);
    688    if (ty == Ity_I16) return mkU16(i);
    689    if (ty == Ity_I32) return mkU32(i);
    690    /* If this panics, it usually means you passed a size (1,2,4)
    691       value as the IRType, rather than a real IRType. */
    692    vpanic("mkU(x86)");
    693 }
    694 
    695 static IRExpr* mkV128 ( UShort mask )
    696 {
    697    return IRExpr_Const(IRConst_V128(mask));
    698 }
    699 
    700 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    701 {
    702    return IRExpr_Load(Iend_LE, ty, addr);
    703 }
    704 
    705 static IROp mkSizedOp ( IRType ty, IROp op8 )
    706 {
    707    Int adj;
    708    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    709    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    710            || op8 == Iop_Mul8
    711            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    712            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    713            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    714            || op8 == Iop_CasCmpNE8
    715            || op8 == Iop_ExpCmpNE8
    716            || op8 == Iop_Not8);
    717    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    718    return adj + op8;
    719 }
    720 
    721 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    722 {
    723    if (szSmall == 1 && szBig == 4) {
    724       return signd ? Iop_8Sto32 : Iop_8Uto32;
    725    }
    726    if (szSmall == 1 && szBig == 2) {
    727       return signd ? Iop_8Sto16 : Iop_8Uto16;
    728    }
    729    if (szSmall == 2 && szBig == 4) {
    730       return signd ? Iop_16Sto32 : Iop_16Uto32;
    731    }
    732    vpanic("mkWidenOp(x86,guest)");
    733 }
    734 
    735 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    736 {
    737    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    738    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    739    return unop(Iop_32to1,
    740                binop(Iop_And32,
    741                      unop(Iop_1Uto32,x),
    742                      unop(Iop_1Uto32,y)));
    743 }
    744 
    745 /* Generate a compare-and-swap operation, operating on memory at
    746    'addr'.  The expected value is 'expVal' and the new value is
    747    'newVal'.  If the operation fails, then transfer control (with a
    748    no-redir jump (XXX no -- see comment at top of this file)) to
    749    'restart_point', which is presumably the address of the guest
    750    instruction again -- retrying, essentially. */
    751 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    752                     Addr32 restart_point )
    753 {
    754    IRCAS* cas;
    755    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    756    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    757    IRTemp oldTmp = newTemp(tyE);
    758    IRTemp expTmp = newTemp(tyE);
    759    vassert(tyE == tyN);
    760    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    761    assign(expTmp, expVal);
    762    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    763                   NULL, mkexpr(expTmp), NULL, newVal );
    764    stmt( IRStmt_CAS(cas) );
    765    stmt( IRStmt_Exit(
    766             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    767                    mkexpr(oldTmp), mkexpr(expTmp) ),
    768             Ijk_Boring, /*Ijk_NoRedir*/
    769             IRConst_U32( restart_point ),
    770             OFFB_EIP
    771          ));
    772 }
    773 
    774 
    775 /*------------------------------------------------------------*/
    776 /*--- Helpers for %eflags.                                 ---*/
    777 /*------------------------------------------------------------*/
    778 
    779 /* -------------- Evaluating the flags-thunk. -------------- */
    780 
    781 /* Build IR to calculate all the eflags from stored
    782    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    783    Ity_I32. */
    784 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    785 {
    786    IRExpr** args
    787       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    788                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    789                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    790                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    791    IRExpr* call
    792       = mkIRExprCCall(
    793            Ity_I32,
    794            0/*regparm*/,
    795            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    796            args
    797         );
    798    /* Exclude OP and NDEP from definedness checking.  We're only
    799       interested in DEP1 and DEP2. */
    800    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    801    return call;
    802 }
    803 
    804 /* Build IR to calculate some particular condition from stored
    805    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    806    Ity_Bit. */
    807 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    808 {
    809    IRExpr** args
    810       = mkIRExprVec_5( mkU32(cond),
    811                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    812                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    813                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    814                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    815    IRExpr* call
    816       = mkIRExprCCall(
    817            Ity_I32,
    818            0/*regparm*/,
    819            "x86g_calculate_condition", &x86g_calculate_condition,
    820            args
    821         );
    822    /* Exclude the requested condition, OP and NDEP from definedness
    823       checking.  We're only interested in DEP1 and DEP2. */
    824    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    825    return unop(Iop_32to1, call);
    826 }
    827 
    828 /* Build IR to calculate just the carry flag from stored
    829    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    830 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    831 {
    832    IRExpr** args
    833       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    834                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    835                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    836                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    837    IRExpr* call
    838       = mkIRExprCCall(
    839            Ity_I32,
    840            3/*regparm*/,
    841            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    842            args
    843         );
    844    /* Exclude OP and NDEP from definedness checking.  We're only
    845       interested in DEP1 and DEP2. */
    846    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    847    return call;
    848 }
    849 
    850 
    851 /* -------------- Building the flags-thunk. -------------- */
    852 
    853 /* The machinery in this section builds the flag-thunk following a
    854    flag-setting operation.  Hence the various setFlags_* functions.
    855 */
    856 
    857 static Bool isAddSub ( IROp op8 )
    858 {
    859    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    860 }
    861 
    862 static Bool isLogic ( IROp op8 )
    863 {
    864    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    865 }
    866 
    867 /* U-widen 8/16/32 bit int expr to 32. */
    868 static IRExpr* widenUto32 ( IRExpr* e )
    869 {
    870    switch (typeOfIRExpr(irsb->tyenv,e)) {
    871       case Ity_I32: return e;
    872       case Ity_I16: return unop(Iop_16Uto32,e);
    873       case Ity_I8:  return unop(Iop_8Uto32,e);
    874       default: vpanic("widenUto32");
    875    }
    876 }
    877 
    878 /* S-widen 8/16/32 bit int expr to 32. */
    879 static IRExpr* widenSto32 ( IRExpr* e )
    880 {
    881    switch (typeOfIRExpr(irsb->tyenv,e)) {
    882       case Ity_I32: return e;
    883       case Ity_I16: return unop(Iop_16Sto32,e);
    884       case Ity_I8:  return unop(Iop_8Sto32,e);
    885       default: vpanic("widenSto32");
    886    }
    887 }
    888 
    889 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    890    of these combinations make sense. */
    891 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    892 {
    893    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    894    if (src_ty == dst_ty)
    895       return e;
    896    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    897       return unop(Iop_32to16, e);
    898    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    899       return unop(Iop_32to8, e);
    900 
    901    vex_printf("\nsrc, dst tys are: ");
    902    ppIRType(src_ty);
    903    vex_printf(", ");
    904    ppIRType(dst_ty);
    905    vex_printf("\n");
    906    vpanic("narrowTo(x86)");
    907 }
    908 
    909 
    910 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    911    auto-sized up to the real op. */
    912 
    913 static
    914 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    915 {
    916    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    917 
    918    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    919 
    920    switch (op8) {
    921       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    922       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    923       default:       ppIROp(op8);
    924                      vpanic("setFlags_DEP1_DEP2(x86)");
    925    }
    926    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    927    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    928    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    929    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    930       elimination of previous stores to this field work better. */
    931    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    932 }
    933 
    934 
    935 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    936 
    937 static
    938 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    939 {
    940    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    941 
    942    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    943 
    944    switch (op8) {
    945       case Iop_Or8:
    946       case Iop_And8:
    947       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    948       default:       ppIROp(op8);
    949                      vpanic("setFlags_DEP1(x86)");
    950    }
    951    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    952    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    953    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    954    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    955       elimination of previous stores to this field work better. */
    956    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    957 }
    958 
    959 
    960 /* For shift operations, we put in the result and the undershifted
    961    result.  Except if the shift amount is zero, the thunk is left
    962    unchanged. */
    963 
    964 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    965                                        IRTemp  res,
    966                                        IRTemp  resUS,
    967                                        IRType  ty,
    968                                        IRTemp  guard )
    969 {
    970    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    971 
    972    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    973    vassert(guard);
    974 
    975    /* Both kinds of right shifts are handled by the same thunk
    976       operation. */
    977    switch (op32) {
    978       case Iop_Shr32:
    979       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    980       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    981       default:        ppIROp(op32);
    982                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    983    }
    984 
    985    /* guard :: Ity_I8.  We need to convert it to I1. */
    986    IRTemp guardB = newTemp(Ity_I1);
    987    assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
    988 
    989    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    990    stmt( IRStmt_Put( OFFB_CC_OP,
    991                      IRExpr_ITE( mkexpr(guardB),
    992                                  mkU32(ccOp),
    993                                  IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
    994    stmt( IRStmt_Put( OFFB_CC_DEP1,
    995                      IRExpr_ITE( mkexpr(guardB),
    996                                  widenUto32(mkexpr(res)),
    997                                  IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
    998    stmt( IRStmt_Put( OFFB_CC_DEP2,
    999                      IRExpr_ITE( mkexpr(guardB),
   1000                                  widenUto32(mkexpr(resUS)),
   1001                                  IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
   1002    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1003       elimination of previous stores to this field work better. */
   1004    stmt( IRStmt_Put( OFFB_CC_NDEP,
   1005                      IRExpr_ITE( mkexpr(guardB),
   1006                                  mkU32(0),
   1007                                  IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   1008 }
   1009 
   1010 
   1011 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1012    the former value of the carry flag, which unfortunately we have to
   1013    compute. */
   1014 
   1015 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1016 {
   1017    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1018 
   1019    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1020    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1021 
   1022    /* This has to come first, because calculating the C flag
   1023       may require reading all four thunk fields. */
   1024    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1025    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1026    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1027    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1028 }
   1029 
   1030 
   1031 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1032    two arguments. */
   1033 
   1034 static
   1035 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1036 {
   1037    switch (ty) {
   1038       case Ity_I8:
   1039          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1040          break;
   1041       case Ity_I16:
   1042          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1043          break;
   1044       case Ity_I32:
   1045          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1046          break;
   1047       default:
   1048          vpanic("setFlags_MUL(x86)");
   1049    }
   1050    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1051    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1052    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1053       elimination of previous stores to this field work better. */
   1054    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1055 }
   1056 
   1057 
   1058 /* -------------- Condition codes. -------------- */
   1059 
   1060 /* Condition codes, using the Intel encoding.  */
   1061 
   1062 static const HChar* name_X86Condcode ( X86Condcode cond )
   1063 {
   1064    switch (cond) {
   1065       case X86CondO:      return "o";
   1066       case X86CondNO:     return "no";
   1067       case X86CondB:      return "b";
   1068       case X86CondNB:     return "nb";
   1069       case X86CondZ:      return "z";
   1070       case X86CondNZ:     return "nz";
   1071       case X86CondBE:     return "be";
   1072       case X86CondNBE:    return "nbe";
   1073       case X86CondS:      return "s";
   1074       case X86CondNS:     return "ns";
   1075       case X86CondP:      return "p";
   1076       case X86CondNP:     return "np";
   1077       case X86CondL:      return "l";
   1078       case X86CondNL:     return "nl";
   1079       case X86CondLE:     return "le";
   1080       case X86CondNLE:    return "nle";
   1081       case X86CondAlways: return "ALWAYS";
   1082       default: vpanic("name_X86Condcode");
   1083    }
   1084 }
   1085 
   1086 static
   1087 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1088                                       Bool*        needInvert )
   1089 {
   1090    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1091    if (cond & 1) {
   1092       *needInvert = True;
   1093       return cond-1;
   1094    } else {
   1095       *needInvert = False;
   1096       return cond;
   1097    }
   1098 }
   1099 
   1100 
   1101 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1102 
   1103 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1104    appropriately.
   1105 
   1106    Optionally, generate a store for the 'tres' value.  This can either
   1107    be a normal store, or it can be a cas-with-possible-failure style
   1108    store:
   1109 
   1110    if taddr is IRTemp_INVALID, then no store is generated.
   1111 
   1112    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1113    the address) is generated:
   1114 
   1115      if texpVal is IRTemp_INVALID then a normal store is
   1116      generated, and restart_point must be zero (it is irrelevant).
   1117 
   1118      if texpVal is not IRTemp_INVALID then a cas-style store is
   1119      generated.  texpVal is the expected value, restart_point
   1120      is the restart point if the store fails, and texpVal must
   1121      have the same type as tres.
   1122 */
   1123 static void helper_ADC ( Int sz,
   1124                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1125                          /* info about optional store: */
   1126                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1127 {
   1128    UInt    thunkOp;
   1129    IRType  ty    = szToITy(sz);
   1130    IRTemp  oldc  = newTemp(Ity_I32);
   1131    IRTemp  oldcn = newTemp(ty);
   1132    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1133    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1134 
   1135    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1136    vassert(sz == 1 || sz == 2 || sz == 4);
   1137    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1138                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1139 
   1140    /* oldc = old carry flag, 0 or 1 */
   1141    assign( oldc,  binop(Iop_And32,
   1142                         mk_x86g_calculate_eflags_c(),
   1143                         mkU32(1)) );
   1144 
   1145    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1146 
   1147    assign( tres, binop(plus,
   1148                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1149                        mkexpr(oldcn)) );
   1150 
   1151    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1152       start of this function. */
   1153    if (taddr != IRTemp_INVALID) {
   1154       if (texpVal == IRTemp_INVALID) {
   1155          vassert(restart_point == 0);
   1156          storeLE( mkexpr(taddr), mkexpr(tres) );
   1157       } else {
   1158          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1159          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1160          casLE( mkexpr(taddr),
   1161                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1162       }
   1163    }
   1164 
   1165    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1166    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1167    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1168                                                          mkexpr(oldcn)) )) );
   1169    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1170 }
   1171 
   1172 
   1173 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1174    appropriately.  As with helper_ADC, possibly generate a store of
   1175    the result -- see comments on helper_ADC for details.
   1176 */
   1177 static void helper_SBB ( Int sz,
   1178                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1179                          /* info about optional store: */
   1180                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1181 {
   1182    UInt    thunkOp;
   1183    IRType  ty    = szToITy(sz);
   1184    IRTemp  oldc  = newTemp(Ity_I32);
   1185    IRTemp  oldcn = newTemp(ty);
   1186    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1187    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1188 
   1189    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1190    vassert(sz == 1 || sz == 2 || sz == 4);
   1191    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1192                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1193 
   1194    /* oldc = old carry flag, 0 or 1 */
   1195    assign( oldc, binop(Iop_And32,
   1196                        mk_x86g_calculate_eflags_c(),
   1197                        mkU32(1)) );
   1198 
   1199    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1200 
   1201    assign( tres, binop(minus,
   1202                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1203                        mkexpr(oldcn)) );
   1204 
   1205    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1206       start of this function. */
   1207    if (taddr != IRTemp_INVALID) {
   1208       if (texpVal == IRTemp_INVALID) {
   1209          vassert(restart_point == 0);
   1210          storeLE( mkexpr(taddr), mkexpr(tres) );
   1211       } else {
   1212          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1213          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1214          casLE( mkexpr(taddr),
   1215                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1216       }
   1217    }
   1218 
   1219    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1220    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1221    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1222                                                          mkexpr(oldcn)) )) );
   1223    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1224 }
   1225 
   1226 
   1227 /* -------------- Helpers for disassembly printing. -------------- */
   1228 
   1229 static const HChar* nameGrp1 ( Int opc_aux )
   1230 {
   1231    static const HChar* grp1_names[8]
   1232      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1233    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1234    return grp1_names[opc_aux];
   1235 }
   1236 
   1237 static const HChar* nameGrp2 ( Int opc_aux )
   1238 {
   1239    static const HChar* grp2_names[8]
   1240      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1241    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1242    return grp2_names[opc_aux];
   1243 }
   1244 
   1245 static const HChar* nameGrp4 ( Int opc_aux )
   1246 {
   1247    static const HChar* grp4_names[8]
   1248      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1249    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1250    return grp4_names[opc_aux];
   1251 }
   1252 
   1253 static const HChar* nameGrp5 ( Int opc_aux )
   1254 {
   1255    static const HChar* grp5_names[8]
   1256      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1257    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1258    return grp5_names[opc_aux];
   1259 }
   1260 
   1261 static const HChar* nameGrp8 ( Int opc_aux )
   1262 {
   1263    static const HChar* grp8_names[8]
   1264      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1265    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1266    return grp8_names[opc_aux];
   1267 }
   1268 
   1269 static const HChar* nameIReg ( Int size, Int reg )
   1270 {
   1271    static const HChar* ireg32_names[8]
   1272      = { "%eax", "%ecx", "%edx", "%ebx",
   1273          "%esp", "%ebp", "%esi", "%edi" };
   1274    static const HChar* ireg16_names[8]
   1275      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1276    static const HChar* ireg8_names[8]
   1277      = { "%al", "%cl", "%dl", "%bl",
   1278          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1279    if (reg < 0 || reg > 7) goto bad;
   1280    switch (size) {
   1281       case 4: return ireg32_names[reg];
   1282       case 2: return ireg16_names[reg];
   1283       case 1: return ireg8_names[reg];
   1284    }
   1285   bad:
   1286    vpanic("nameIReg(X86)");
   1287    return NULL; /*notreached*/
   1288 }
   1289 
   1290 static const HChar* nameSReg ( UInt sreg )
   1291 {
   1292    switch (sreg) {
   1293       case R_ES: return "%es";
   1294       case R_CS: return "%cs";
   1295       case R_SS: return "%ss";
   1296       case R_DS: return "%ds";
   1297       case R_FS: return "%fs";
   1298       case R_GS: return "%gs";
   1299       default: vpanic("nameSReg(x86)");
   1300    }
   1301 }
   1302 
   1303 static const HChar* nameMMXReg ( Int mmxreg )
   1304 {
   1305    static const HChar* mmx_names[8]
   1306      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1307    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1308    return mmx_names[mmxreg];
   1309 }
   1310 
   1311 static const HChar* nameXMMReg ( Int xmmreg )
   1312 {
   1313    static const HChar* xmm_names[8]
   1314      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1315          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1316    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1317    return xmm_names[xmmreg];
   1318 }
   1319 
   1320 static const HChar* nameMMXGran ( Int gran )
   1321 {
   1322    switch (gran) {
   1323       case 0: return "b";
   1324       case 1: return "w";
   1325       case 2: return "d";
   1326       case 3: return "q";
   1327       default: vpanic("nameMMXGran(x86,guest)");
   1328    }
   1329 }
   1330 
   1331 static HChar nameISize ( Int size )
   1332 {
   1333    switch (size) {
   1334       case 4: return 'l';
   1335       case 2: return 'w';
   1336       case 1: return 'b';
   1337       default: vpanic("nameISize(x86)");
   1338    }
   1339 }
   1340 
   1341 
   1342 /*------------------------------------------------------------*/
   1343 /*--- JMP helpers                                          ---*/
   1344 /*------------------------------------------------------------*/
   1345 
   1346 static void jmp_lit( /*MOD*/DisResult* dres,
   1347                      IRJumpKind kind, Addr32 d32 )
   1348 {
   1349    vassert(dres->whatNext    == Dis_Continue);
   1350    vassert(dres->len         == 0);
   1351    vassert(dres->continueAt  == 0);
   1352    vassert(dres->jk_StopHere == Ijk_INVALID);
   1353    dres->whatNext    = Dis_StopHere;
   1354    dres->jk_StopHere = kind;
   1355    stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
   1356 }
   1357 
   1358 static void jmp_treg( /*MOD*/DisResult* dres,
   1359                       IRJumpKind kind, IRTemp t )
   1360 {
   1361    vassert(dres->whatNext    == Dis_Continue);
   1362    vassert(dres->len         == 0);
   1363    vassert(dres->continueAt  == 0);
   1364    vassert(dres->jk_StopHere == Ijk_INVALID);
   1365    dres->whatNext    = Dis_StopHere;
   1366    dres->jk_StopHere = kind;
   1367    stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
   1368 }
   1369 
   1370 static
   1371 void jcc_01( /*MOD*/DisResult* dres,
   1372              X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1373 {
   1374    Bool        invert;
   1375    X86Condcode condPos;
   1376    vassert(dres->whatNext    == Dis_Continue);
   1377    vassert(dres->len         == 0);
   1378    vassert(dres->continueAt  == 0);
   1379    vassert(dres->jk_StopHere == Ijk_INVALID);
   1380    dres->whatNext    = Dis_StopHere;
   1381    dres->jk_StopHere = Ijk_Boring;
   1382    condPos = positiveIse_X86Condcode ( cond, &invert );
   1383    if (invert) {
   1384       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1385                          Ijk_Boring,
   1386                          IRConst_U32(d32_false),
   1387                          OFFB_EIP ) );
   1388       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
   1389    } else {
   1390       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1391                          Ijk_Boring,
   1392                          IRConst_U32(d32_true),
   1393                          OFFB_EIP ) );
   1394       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
   1395    }
   1396 }
   1397 
   1398 
   1399 /*------------------------------------------------------------*/
   1400 /*--- Disassembling addressing modes                       ---*/
   1401 /*------------------------------------------------------------*/
   1402 
   1403 static
   1404 const HChar* sorbTxt ( UChar sorb )
   1405 {
   1406    switch (sorb) {
   1407       case 0:    return ""; /* no override */
   1408       case 0x3E: return "%ds";
   1409       case 0x26: return "%es:";
   1410       case 0x64: return "%fs:";
   1411       case 0x65: return "%gs:";
   1412       default: vpanic("sorbTxt(x86,guest)");
   1413    }
   1414 }
   1415 
   1416 
   1417 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1418    linear address by adding any required segment override as indicated
   1419    by sorb. */
   1420 static
   1421 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1422 {
   1423    Int    sreg;
   1424    IRType hWordTy;
   1425    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1426 
   1427    if (sorb == 0)
   1428       /* the common case - no override */
   1429       return virtual;
   1430 
   1431    switch (sorb) {
   1432       case 0x3E: sreg = R_DS; break;
   1433       case 0x26: sreg = R_ES; break;
   1434       case 0x64: sreg = R_FS; break;
   1435       case 0x65: sreg = R_GS; break;
   1436       default: vpanic("handleSegOverride(x86,guest)");
   1437    }
   1438 
   1439    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1440 
   1441    seg_selector = newTemp(Ity_I32);
   1442    ldt_ptr      = newTemp(hWordTy);
   1443    gdt_ptr      = newTemp(hWordTy);
   1444    r64          = newTemp(Ity_I64);
   1445 
   1446    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1447    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1448    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1449 
   1450    /*
   1451    Call this to do the translation and limit checks:
   1452    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1453                                  UInt seg_selector, UInt virtual_addr )
   1454    */
   1455    assign(
   1456       r64,
   1457       mkIRExprCCall(
   1458          Ity_I64,
   1459          0/*regparms*/,
   1460          "x86g_use_seg_selector",
   1461          &x86g_use_seg_selector,
   1462          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1463                         mkexpr(seg_selector), virtual)
   1464       )
   1465    );
   1466 
   1467    /* If the high 32 of the result are non-zero, there was a
   1468       failure in address translation.  In which case, make a
   1469       quick exit.
   1470    */
   1471    stmt(
   1472       IRStmt_Exit(
   1473          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1474          Ijk_MapFail,
   1475          IRConst_U32( guest_EIP_curr_instr ),
   1476          OFFB_EIP
   1477       )
   1478    );
   1479 
   1480    /* otherwise, here's the translated result. */
   1481    return unop(Iop_64to32, mkexpr(r64));
   1482 }
   1483 
   1484 
   1485 /* Generate IR to calculate an address indicated by a ModRM and
   1486    following SIB bytes.  The expression, and the number of bytes in
   1487    the address mode, are returned.  Note that this fn should not be
   1488    called if the R/M part of the address denotes a register instead of
   1489    memory.  If print_codegen is true, text of the addressing mode is
   1490    placed in buf.
   1491 
   1492    The computed address is stored in a new tempreg, and the
   1493    identity of the tempreg is returned.  */
   1494 
   1495 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1496 {
   1497    IRTemp tmp = newTemp(Ity_I32);
   1498    assign( tmp, addr32 );
   1499    return tmp;
   1500 }
   1501 
   1502 static
   1503 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1504 {
   1505    UChar mod_reg_rm = getIByte(delta);
   1506    delta++;
   1507 
   1508    buf[0] = (UChar)0;
   1509 
   1510    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1511       jump table seems a bit excessive.
   1512    */
   1513    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1514    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1515                                             /* is now XX0XXYYY */
   1516    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1517    switch (mod_reg_rm) {
   1518 
   1519       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1520          --> GET %reg, t
   1521       */
   1522       case 0x00: case 0x01: case 0x02: case 0x03:
   1523       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1524          { UChar rm = mod_reg_rm;
   1525            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1526            *len = 1;
   1527            return disAMode_copy2tmp(
   1528                   handleSegOverride(sorb, getIReg(4,rm)));
   1529          }
   1530 
   1531       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1532          --> GET %reg, t ; ADDL d8, t
   1533       */
   1534       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1535       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1536          { UChar rm = toUChar(mod_reg_rm & 7);
   1537            UInt  d  = getSDisp8(delta);
   1538            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1539            *len = 2;
   1540            return disAMode_copy2tmp(
   1541                   handleSegOverride(sorb,
   1542                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1543          }
   1544 
   1545       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1546          --> GET %reg, t ; ADDL d8, t
   1547       */
   1548       case 0x10: case 0x11: case 0x12: case 0x13:
   1549       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1550          { UChar rm = toUChar(mod_reg_rm & 7);
   1551            UInt  d  = getUDisp32(delta);
   1552            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1553            *len = 5;
   1554            return disAMode_copy2tmp(
   1555                   handleSegOverride(sorb,
   1556                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1557          }
   1558 
   1559       /* a register, %eax .. %edi.  This shouldn't happen. */
   1560       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1561       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1562          vpanic("disAMode(x86): not an addr!");
   1563 
   1564       /* a 32-bit literal address
   1565          --> MOV d32, tmp
   1566       */
   1567       case 0x05:
   1568          { UInt d = getUDisp32(delta);
   1569            *len = 5;
   1570            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1571            return disAMode_copy2tmp(
   1572                      handleSegOverride(sorb, mkU32(d)));
   1573          }
   1574 
   1575       case 0x04: {
   1576          /* SIB, with no displacement.  Special cases:
   1577             -- %esp cannot act as an index value.
   1578                If index_r indicates %esp, zero is used for the index.
   1579             -- when mod is zero and base indicates EBP, base is instead
   1580                a 32-bit literal.
   1581             It's all madness, I tell you.  Extract %index, %base and
   1582             scale from the SIB byte.  The value denoted is then:
   1583                | %index == %ESP && %base == %EBP
   1584                = d32 following SIB byte
   1585                | %index == %ESP && %base != %EBP
   1586                = %base
   1587                | %index != %ESP && %base == %EBP
   1588                = d32 following SIB byte + (%index << scale)
   1589                | %index != %ESP && %base != %ESP
   1590                = %base + (%index << scale)
   1591 
   1592             What happens to the souls of CPU architects who dream up such
   1593             horrendous schemes, do you suppose?
   1594          */
   1595          UChar sib     = getIByte(delta);
   1596          UChar scale   = toUChar((sib >> 6) & 3);
   1597          UChar index_r = toUChar((sib >> 3) & 7);
   1598          UChar base_r  = toUChar(sib & 7);
   1599          delta++;
   1600 
   1601          if (index_r != R_ESP && base_r != R_EBP) {
   1602             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1603                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1604             *len = 2;
   1605             return
   1606                disAMode_copy2tmp(
   1607                handleSegOverride(sorb,
   1608                   binop(Iop_Add32,
   1609                         getIReg(4,base_r),
   1610                         binop(Iop_Shl32, getIReg(4,index_r),
   1611                               mkU8(scale)))));
   1612          }
   1613 
   1614          if (index_r != R_ESP && base_r == R_EBP) {
   1615             UInt d = getUDisp32(delta);
   1616             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1617                       nameIReg(4,index_r), 1<<scale);
   1618             *len = 6;
   1619             return
   1620                disAMode_copy2tmp(
   1621                handleSegOverride(sorb,
   1622                   binop(Iop_Add32,
   1623                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1624                         mkU32(d))));
   1625          }
   1626 
   1627          if (index_r == R_ESP && base_r != R_EBP) {
   1628             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1629             *len = 2;
   1630             return disAMode_copy2tmp(
   1631                    handleSegOverride(sorb, getIReg(4,base_r)));
   1632          }
   1633 
   1634          if (index_r == R_ESP && base_r == R_EBP) {
   1635             UInt d = getUDisp32(delta);
   1636             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1637             *len = 6;
   1638             return disAMode_copy2tmp(
   1639                    handleSegOverride(sorb, mkU32(d)));
   1640          }
   1641          /*NOTREACHED*/
   1642          vassert(0);
   1643       }
   1644 
   1645       /* SIB, with 8-bit displacement.  Special cases:
   1646          -- %esp cannot act as an index value.
   1647             If index_r indicates %esp, zero is used for the index.
   1648          Denoted value is:
   1649             | %index == %ESP
   1650             = d8 + %base
   1651             | %index != %ESP
   1652             = d8 + %base + (%index << scale)
   1653       */
   1654       case 0x0C: {
   1655          UChar sib     = getIByte(delta);
   1656          UChar scale   = toUChar((sib >> 6) & 3);
   1657          UChar index_r = toUChar((sib >> 3) & 7);
   1658          UChar base_r  = toUChar(sib & 7);
   1659          UInt  d       = getSDisp8(delta+1);
   1660 
   1661          if (index_r == R_ESP) {
   1662             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1663                                    (Int)d, nameIReg(4,base_r));
   1664             *len = 3;
   1665             return disAMode_copy2tmp(
   1666                    handleSegOverride(sorb,
   1667                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1668          } else {
   1669             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1670                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1671             *len = 3;
   1672             return
   1673                 disAMode_copy2tmp(
   1674                 handleSegOverride(sorb,
   1675                   binop(Iop_Add32,
   1676                         binop(Iop_Add32,
   1677                               getIReg(4,base_r),
   1678                               binop(Iop_Shl32,
   1679                                     getIReg(4,index_r), mkU8(scale))),
   1680                         mkU32(d))));
   1681          }
   1682 	 /*NOTREACHED*/
   1683          vassert(0);
   1684       }
   1685 
   1686       /* SIB, with 32-bit displacement.  Special cases:
   1687          -- %esp cannot act as an index value.
   1688             If index_r indicates %esp, zero is used for the index.
   1689          Denoted value is:
   1690             | %index == %ESP
   1691             = d32 + %base
   1692             | %index != %ESP
   1693             = d32 + %base + (%index << scale)
   1694       */
   1695       case 0x14: {
   1696          UChar sib     = getIByte(delta);
   1697          UChar scale   = toUChar((sib >> 6) & 3);
   1698          UChar index_r = toUChar((sib >> 3) & 7);
   1699          UChar base_r  = toUChar(sib & 7);
   1700          UInt d        = getUDisp32(delta+1);
   1701 
   1702          if (index_r == R_ESP) {
   1703             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1704                                    (Int)d, nameIReg(4,base_r));
   1705             *len = 6;
   1706             return disAMode_copy2tmp(
   1707                    handleSegOverride(sorb,
   1708                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1709          } else {
   1710             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1711                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1712             *len = 6;
   1713             return
   1714                 disAMode_copy2tmp(
   1715                 handleSegOverride(sorb,
   1716                   binop(Iop_Add32,
   1717                         binop(Iop_Add32,
   1718                               getIReg(4,base_r),
   1719                               binop(Iop_Shl32,
   1720                                     getIReg(4,index_r), mkU8(scale))),
   1721                         mkU32(d))));
   1722          }
   1723 	 /*NOTREACHED*/
   1724          vassert(0);
   1725       }
   1726 
   1727       default:
   1728          vpanic("disAMode(x86)");
   1729          return 0; /*notreached*/
   1730    }
   1731 }
   1732 
   1733 
   1734 /* Figure out the number of (insn-stream) bytes constituting the amode
   1735    beginning at delta.  Is useful for getting hold of literals beyond
   1736    the end of the amode before it has been disassembled.  */
   1737 
   1738 static UInt lengthAMode ( Int delta )
   1739 {
   1740    UChar mod_reg_rm = getIByte(delta); delta++;
   1741 
   1742    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1743       jump table seems a bit excessive.
   1744    */
   1745    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1746    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1747                                      /* is now XX0XXYYY */
   1748    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
   1749    switch (mod_reg_rm) {
   1750 
   1751       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1752       case 0x00: case 0x01: case 0x02: case 0x03:
   1753       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1754          return 1;
   1755 
   1756       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1757       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1758       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1759          return 2;
   1760 
   1761       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1762       case 0x10: case 0x11: case 0x12: case 0x13:
   1763       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1764          return 5;
   1765 
   1766       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1767       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1768       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1769          return 1;
   1770 
   1771       /* a 32-bit literal address. */
   1772       case 0x05: return 5;
   1773 
   1774       /* SIB, no displacement.  */
   1775       case 0x04: {
   1776          UChar sib    = getIByte(delta);
   1777          UChar base_r = toUChar(sib & 7);
   1778          if (base_r == R_EBP) return 6; else return 2;
   1779       }
   1780       /* SIB, with 8-bit displacement.  */
   1781       case 0x0C: return 3;
   1782 
   1783       /* SIB, with 32-bit displacement.  */
   1784       case 0x14: return 6;
   1785 
   1786       default:
   1787          vpanic("lengthAMode");
   1788          return 0; /*notreached*/
   1789    }
   1790 }
   1791 
   1792 /*------------------------------------------------------------*/
   1793 /*--- Disassembling common idioms                          ---*/
   1794 /*------------------------------------------------------------*/
   1795 
   1796 /* Handle binary integer instructions of the form
   1797       op E, G  meaning
   1798       op reg-or-mem, reg
   1799    Is passed the a ptr to the modRM byte, the actual operation, and the
   1800    data size.  Returns the address advanced completely over this
   1801    instruction.
   1802 
   1803    E(src) is reg-or-mem
   1804    G(dst) is reg.
   1805 
   1806    If E is reg, -->    GET %G,  tmp
   1807                        OP %E,   tmp
   1808                        PUT tmp, %G
   1809 
   1810    If E is mem and OP is not reversible,
   1811                 -->    (getAddr E) -> tmpa
   1812                        LD (tmpa), tmpa
   1813                        GET %G, tmp2
   1814                        OP tmpa, tmp2
   1815                        PUT tmp2, %G
   1816 
   1817    If E is mem and OP is reversible
   1818                 -->    (getAddr E) -> tmpa
   1819                        LD (tmpa), tmpa
   1820                        OP %G, tmpa
   1821                        PUT tmpa, %G
   1822 */
   1823 static
   1824 UInt dis_op2_E_G ( UChar       sorb,
   1825                    Bool        addSubCarry,
   1826                    IROp        op8,
   1827                    Bool        keep,
   1828                    Int         size,
   1829                    Int         delta0,
   1830                    const HChar* t_x86opc )
   1831 {
   1832    HChar   dis_buf[50];
   1833    Int     len;
   1834    IRType  ty   = szToITy(size);
   1835    IRTemp  dst1 = newTemp(ty);
   1836    IRTemp  src  = newTemp(ty);
   1837    IRTemp  dst0 = newTemp(ty);
   1838    UChar   rm   = getUChar(delta0);
   1839    IRTemp  addr = IRTemp_INVALID;
   1840 
   1841    /* addSubCarry == True indicates the intended operation is
   1842       add-with-carry or subtract-with-borrow. */
   1843    if (addSubCarry) {
   1844       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1845       vassert(keep);
   1846    }
   1847 
   1848    if (epartIsReg(rm)) {
   1849       /* Specially handle XOR reg,reg, because that doesn't really
   1850          depend on reg, and doing the obvious thing potentially
   1851          generates a spurious value check failure due to the bogus
   1852          dependency.  Ditto SBB reg,reg. */
   1853       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1854           && gregOfRM(rm) == eregOfRM(rm)) {
   1855          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1856       }
   1857       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1858       assign( src,  getIReg(size,eregOfRM(rm)) );
   1859 
   1860       if (addSubCarry && op8 == Iop_Add8) {
   1861          helper_ADC( size, dst1, dst0, src,
   1862                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1863          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1864       } else
   1865       if (addSubCarry && op8 == Iop_Sub8) {
   1866          helper_SBB( size, dst1, dst0, src,
   1867                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1868          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1869       } else {
   1870          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1871          if (isAddSub(op8))
   1872             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1873          else
   1874             setFlags_DEP1(op8, dst1, ty);
   1875          if (keep)
   1876             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1877       }
   1878 
   1879       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1880                           nameIReg(size,eregOfRM(rm)),
   1881                           nameIReg(size,gregOfRM(rm)));
   1882       return 1+delta0;
   1883    } else {
   1884       /* E refers to memory */
   1885       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1886       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1887       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1888 
   1889       if (addSubCarry && op8 == Iop_Add8) {
   1890          helper_ADC( size, dst1, dst0, src,
   1891                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1892          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1893       } else
   1894       if (addSubCarry && op8 == Iop_Sub8) {
   1895          helper_SBB( size, dst1, dst0, src,
   1896                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1897          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1898       } else {
   1899          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1900          if (isAddSub(op8))
   1901             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1902          else
   1903             setFlags_DEP1(op8, dst1, ty);
   1904          if (keep)
   1905             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1906       }
   1907 
   1908       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1909                           dis_buf,nameIReg(size,gregOfRM(rm)));
   1910       return len+delta0;
   1911    }
   1912 }
   1913 
   1914 
   1915 
   1916 /* Handle binary integer instructions of the form
   1917       op G, E  meaning
   1918       op reg, reg-or-mem
   1919    Is passed the a ptr to the modRM byte, the actual operation, and the
   1920    data size.  Returns the address advanced completely over this
   1921    instruction.
   1922 
   1923    G(src) is reg.
   1924    E(dst) is reg-or-mem
   1925 
   1926    If E is reg, -->    GET %E,  tmp
   1927                        OP %G,   tmp
   1928                        PUT tmp, %E
   1929 
   1930    If E is mem, -->    (getAddr E) -> tmpa
   1931                        LD (tmpa), tmpv
   1932                        OP %G, tmpv
   1933                        ST tmpv, (tmpa)
   1934 */
   1935 static
   1936 UInt dis_op2_G_E ( UChar       sorb,
   1937                    Bool        locked,
   1938                    Bool        addSubCarry,
   1939                    IROp        op8,
   1940                    Bool        keep,
   1941                    Int         size,
   1942                    Int         delta0,
   1943                    const HChar* t_x86opc )
   1944 {
   1945    HChar   dis_buf[50];
   1946    Int     len;
   1947    IRType  ty   = szToITy(size);
   1948    IRTemp  dst1 = newTemp(ty);
   1949    IRTemp  src  = newTemp(ty);
   1950    IRTemp  dst0 = newTemp(ty);
   1951    UChar   rm   = getIByte(delta0);
   1952    IRTemp  addr = IRTemp_INVALID;
   1953 
   1954    /* addSubCarry == True indicates the intended operation is
   1955       add-with-carry or subtract-with-borrow. */
   1956    if (addSubCarry) {
   1957       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1958       vassert(keep);
   1959    }
   1960 
   1961    if (epartIsReg(rm)) {
   1962       /* Specially handle XOR reg,reg, because that doesn't really
   1963          depend on reg, and doing the obvious thing potentially
   1964          generates a spurious value check failure due to the bogus
   1965          dependency.  Ditto SBB reg,reg.*/
   1966       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1967           && gregOfRM(rm) == eregOfRM(rm)) {
   1968          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1969       }
   1970       assign(dst0, getIReg(size,eregOfRM(rm)));
   1971       assign(src,  getIReg(size,gregOfRM(rm)));
   1972 
   1973       if (addSubCarry && op8 == Iop_Add8) {
   1974          helper_ADC( size, dst1, dst0, src,
   1975                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1976          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1977       } else
   1978       if (addSubCarry && op8 == Iop_Sub8) {
   1979          helper_SBB( size, dst1, dst0, src,
   1980                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1981          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1982       } else {
   1983          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   1984          if (isAddSub(op8))
   1985             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1986          else
   1987             setFlags_DEP1(op8, dst1, ty);
   1988          if (keep)
   1989             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1990       }
   1991 
   1992       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1993                           nameIReg(size,gregOfRM(rm)),
   1994                           nameIReg(size,eregOfRM(rm)));
   1995       return 1+delta0;
   1996    }
   1997 
   1998    /* E refers to memory */
   1999    {
   2000       addr = disAMode ( &len, sorb, delta0, dis_buf);
   2001       assign(dst0, loadLE(ty,mkexpr(addr)));
   2002       assign(src,  getIReg(size,gregOfRM(rm)));
   2003 
   2004       if (addSubCarry && op8 == Iop_Add8) {
   2005          if (locked) {
   2006             /* cas-style store */
   2007             helper_ADC( size, dst1, dst0, src,
   2008                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2009          } else {
   2010             /* normal store */
   2011             helper_ADC( size, dst1, dst0, src,
   2012                         /*store*/addr, IRTemp_INVALID, 0 );
   2013          }
   2014       } else
   2015       if (addSubCarry && op8 == Iop_Sub8) {
   2016          if (locked) {
   2017             /* cas-style store */
   2018             helper_SBB( size, dst1, dst0, src,
   2019                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2020          } else {
   2021             /* normal store */
   2022             helper_SBB( size, dst1, dst0, src,
   2023                         /*store*/addr, IRTemp_INVALID, 0 );
   2024          }
   2025       } else {
   2026          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2027          if (keep) {
   2028             if (locked) {
   2029                if (0) vex_printf("locked case\n" );
   2030                casLE( mkexpr(addr),
   2031                       mkexpr(dst0)/*expval*/,
   2032                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2033             } else {
   2034                if (0) vex_printf("nonlocked case\n");
   2035                storeLE(mkexpr(addr), mkexpr(dst1));
   2036             }
   2037          }
   2038          if (isAddSub(op8))
   2039             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2040          else
   2041             setFlags_DEP1(op8, dst1, ty);
   2042       }
   2043 
   2044       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2045                           nameIReg(size,gregOfRM(rm)), dis_buf);
   2046       return len+delta0;
   2047    }
   2048 }
   2049 
   2050 
   2051 /* Handle move instructions of the form
   2052       mov E, G  meaning
   2053       mov reg-or-mem, reg
   2054    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2055    the address advanced completely over this instruction.
   2056 
   2057    E(src) is reg-or-mem
   2058    G(dst) is reg.
   2059 
   2060    If E is reg, -->    GET %E,  tmpv
   2061                        PUT tmpv, %G
   2062 
   2063    If E is mem  -->    (getAddr E) -> tmpa
   2064                        LD (tmpa), tmpb
   2065                        PUT tmpb, %G
   2066 */
   2067 static
   2068 UInt dis_mov_E_G ( UChar       sorb,
   2069                    Int         size,
   2070                    Int         delta0 )
   2071 {
   2072    Int len;
   2073    UChar rm = getIByte(delta0);
   2074    HChar dis_buf[50];
   2075 
   2076    if (epartIsReg(rm)) {
   2077       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2078       DIP("mov%c %s,%s\n", nameISize(size),
   2079                            nameIReg(size,eregOfRM(rm)),
   2080                            nameIReg(size,gregOfRM(rm)));
   2081       return 1+delta0;
   2082    }
   2083 
   2084    /* E refers to memory */
   2085    {
   2086       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2087       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2088       DIP("mov%c %s,%s\n", nameISize(size),
   2089                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2090       return delta0+len;
   2091    }
   2092 }
   2093 
   2094 
   2095 /* Handle move instructions of the form
   2096       mov G, E  meaning
   2097       mov reg, reg-or-mem
   2098    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2099    the address advanced completely over this instruction.
   2100 
   2101    G(src) is reg.
   2102    E(dst) is reg-or-mem
   2103 
   2104    If E is reg, -->    GET %G,  tmp
   2105                        PUT tmp, %E
   2106 
   2107    If E is mem, -->    (getAddr E) -> tmpa
   2108                        GET %G, tmpv
   2109                        ST tmpv, (tmpa)
   2110 */
   2111 static
   2112 UInt dis_mov_G_E ( UChar       sorb,
   2113                    Int         size,
   2114                    Int         delta0 )
   2115 {
   2116    Int len;
   2117    UChar rm = getIByte(delta0);
   2118    HChar dis_buf[50];
   2119 
   2120    if (epartIsReg(rm)) {
   2121       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2122       DIP("mov%c %s,%s\n", nameISize(size),
   2123                            nameIReg(size,gregOfRM(rm)),
   2124                            nameIReg(size,eregOfRM(rm)));
   2125       return 1+delta0;
   2126    }
   2127 
   2128    /* E refers to memory */
   2129    {
   2130       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2131       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2132       DIP("mov%c %s,%s\n", nameISize(size),
   2133                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2134       return len+delta0;
   2135    }
   2136 }
   2137 
   2138 
   2139 /* op $immediate, AL/AX/EAX. */
   2140 static
   2141 UInt dis_op_imm_A ( Int    size,
   2142                     Bool   carrying,
   2143                     IROp   op8,
   2144                     Bool   keep,
   2145                     Int    delta,
   2146                     const HChar* t_x86opc )
   2147 {
   2148    IRType ty   = szToITy(size);
   2149    IRTemp dst0 = newTemp(ty);
   2150    IRTemp src  = newTemp(ty);
   2151    IRTemp dst1 = newTemp(ty);
   2152    UInt lit    = getUDisp(size,delta);
   2153    assign(dst0, getIReg(size,R_EAX));
   2154    assign(src,  mkU(ty,lit));
   2155 
   2156    if (isAddSub(op8) && !carrying) {
   2157       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2158       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2159    }
   2160    else
   2161    if (isLogic(op8)) {
   2162       vassert(!carrying);
   2163       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2164       setFlags_DEP1(op8, dst1, ty);
   2165    }
   2166    else
   2167    if (op8 == Iop_Add8 && carrying) {
   2168       helper_ADC( size, dst1, dst0, src,
   2169                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2170    }
   2171    else
   2172    if (op8 == Iop_Sub8 && carrying) {
   2173       helper_SBB( size, dst1, dst0, src,
   2174                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2175    }
   2176    else
   2177       vpanic("dis_op_imm_A(x86,guest)");
   2178 
   2179    if (keep)
   2180       putIReg(size, R_EAX, mkexpr(dst1));
   2181 
   2182    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2183                            lit, nameIReg(size,R_EAX));
   2184    return delta+size;
   2185 }
   2186 
   2187 
   2188 /* Sign- and Zero-extending moves. */
   2189 static
   2190 UInt dis_movx_E_G ( UChar      sorb,
   2191                     Int delta, Int szs, Int szd, Bool sign_extend )
   2192 {
   2193    UChar rm = getIByte(delta);
   2194    if (epartIsReg(rm)) {
   2195       if (szd == szs) {
   2196          // mutant case.  See #250799
   2197          putIReg(szd, gregOfRM(rm),
   2198                            getIReg(szs,eregOfRM(rm)));
   2199       } else {
   2200          // normal case
   2201          putIReg(szd, gregOfRM(rm),
   2202                       unop(mkWidenOp(szs,szd,sign_extend),
   2203                            getIReg(szs,eregOfRM(rm))));
   2204       }
   2205       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2206                                nameISize(szs), nameISize(szd),
   2207                                nameIReg(szs,eregOfRM(rm)),
   2208                                nameIReg(szd,gregOfRM(rm)));
   2209       return 1+delta;
   2210    }
   2211 
   2212    /* E refers to memory */
   2213    {
   2214       Int    len;
   2215       HChar  dis_buf[50];
   2216       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
   2217       if (szd == szs) {
   2218          // mutant case.  See #250799
   2219          putIReg(szd, gregOfRM(rm),
   2220                            loadLE(szToITy(szs),mkexpr(addr)));
   2221       } else {
   2222          // normal case
   2223          putIReg(szd, gregOfRM(rm),
   2224                       unop(mkWidenOp(szs,szd,sign_extend),
   2225                            loadLE(szToITy(szs),mkexpr(addr))));
   2226       }
   2227       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2228                                nameISize(szs), nameISize(szd),
   2229                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2230       return len+delta;
   2231    }
   2232 }
   2233 
   2234 
   2235 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2236    16 / 8 bit quantity in the given IRTemp.  */
   2237 static
   2238 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2239 {
   2240    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2241    IRTemp src64 = newTemp(Ity_I64);
   2242    IRTemp dst64 = newTemp(Ity_I64);
   2243    switch (sz) {
   2244       case 4:
   2245          assign( src64, binop(Iop_32HLto64,
   2246                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2247          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2248          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2249          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2250          break;
   2251       case 2: {
   2252          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2253          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2254          assign( src64, unop(widen3264,
   2255                              binop(Iop_16HLto32,
   2256                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2257          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2258          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2259          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2260          break;
   2261       }
   2262       case 1: {
   2263          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2264          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2265          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2266          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2267          assign( dst64,
   2268                  binop(op, mkexpr(src64),
   2269                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2270          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2271                            unop(Iop_64to32,mkexpr(dst64)))) );
   2272          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2273                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2274          break;
   2275       }
   2276       default: vpanic("codegen_div(x86)");
   2277    }
   2278 }
   2279 
   2280 
   2281 static
   2282 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2283                 Int delta, UChar modrm,
   2284                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2285 {
   2286    Int     len;
   2287    HChar   dis_buf[50];
   2288    IRType  ty   = szToITy(sz);
   2289    IRTemp  dst1 = newTemp(ty);
   2290    IRTemp  src  = newTemp(ty);
   2291    IRTemp  dst0 = newTemp(ty);
   2292    IRTemp  addr = IRTemp_INVALID;
   2293    IROp    op8  = Iop_INVALID;
   2294    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2295 
   2296    switch (gregOfRM(modrm)) {
   2297       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2298       case 2: break;  // ADC
   2299       case 3: break;  // SBB
   2300       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2301       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2302       /*NOTREACHED*/
   2303       default: vpanic("dis_Grp1: unhandled case");
   2304    }
   2305 
   2306    if (epartIsReg(modrm)) {
   2307       vassert(am_sz == 1);
   2308 
   2309       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2310       assign(src,  mkU(ty,d32 & mask));
   2311 
   2312       if (gregOfRM(modrm) == 2 /* ADC */) {
   2313          helper_ADC( sz, dst1, dst0, src,
   2314                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2315       } else
   2316       if (gregOfRM(modrm) == 3 /* SBB */) {
   2317          helper_SBB( sz, dst1, dst0, src,
   2318                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2319       } else {
   2320          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2321          if (isAddSub(op8))
   2322             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2323          else
   2324             setFlags_DEP1(op8, dst1, ty);
   2325       }
   2326 
   2327       if (gregOfRM(modrm) < 7)
   2328          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2329 
   2330       delta += (am_sz + d_sz);
   2331       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2332                               nameIReg(sz,eregOfRM(modrm)));
   2333    } else {
   2334       addr = disAMode ( &len, sorb, delta, dis_buf);
   2335 
   2336       assign(dst0, loadLE(ty,mkexpr(addr)));
   2337       assign(src, mkU(ty,d32 & mask));
   2338 
   2339       if (gregOfRM(modrm) == 2 /* ADC */) {
   2340          if (locked) {
   2341             /* cas-style store */
   2342             helper_ADC( sz, dst1, dst0, src,
   2343                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2344          } else {
   2345             /* normal store */
   2346             helper_ADC( sz, dst1, dst0, src,
   2347                         /*store*/addr, IRTemp_INVALID, 0 );
   2348          }
   2349       } else
   2350       if (gregOfRM(modrm) == 3 /* SBB */) {
   2351          if (locked) {
   2352             /* cas-style store */
   2353             helper_SBB( sz, dst1, dst0, src,
   2354                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2355          } else {
   2356             /* normal store */
   2357             helper_SBB( sz, dst1, dst0, src,
   2358                         /*store*/addr, IRTemp_INVALID, 0 );
   2359          }
   2360       } else {
   2361          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2362          if (gregOfRM(modrm) < 7) {
   2363             if (locked) {
   2364                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2365                                     mkexpr(dst1)/*newVal*/,
   2366                                     guest_EIP_curr_instr );
   2367             } else {
   2368                storeLE(mkexpr(addr), mkexpr(dst1));
   2369             }
   2370          }
   2371          if (isAddSub(op8))
   2372             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2373          else
   2374             setFlags_DEP1(op8, dst1, ty);
   2375       }
   2376 
   2377       delta += (len+d_sz);
   2378       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2379                               d32, dis_buf);
   2380    }
   2381    return delta;
   2382 }
   2383 
   2384 
/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   expression.

   Generates IR for the rotate/shift selected by the reg field of
   modrm, applied to the register or memory operand named by the E
   part of modrm:

      0, 1   rotate left / right
      2, 3   rotate through carry left / right, done by calling the
             x86g_calculate_RCL / x86g_calculate_RCR helpers
      4, 6   shift left (both map to Iop_Shl32)
      5      logical shift right
      7      arithmetic shift right

   sz is the operand size in bytes and must be 1, 2 or 4.  shift_expr
   is the shift/rotate amount.  In the register case delta is advanced
   by am_sz + d_sz; in the memory case by the length reported by
   disAMode plus d_sz.  If shift_expr_txt is non-NULL it is printed as
   the shift amount in front-end trace output; otherwise shift_expr is
   pretty-printed instead.

   *decode_OK is set to True and is never cleared by this function.
   The result written to memory uses a plain storeLE; there is no
   LOCK-prefix handling here.  Returns the updated delta. */

static
UInt dis_Grp2 ( UChar sorb,
                Int delta, UChar modrm,
                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* operand before the operation */
   IRTemp dst1  = newTemp(ty);   /* operand after the operation */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIReg(sz, eregOfRM(modrm)));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, sorb, delta, dis_buf);
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode.  Exactly one of the three flags ends up
      set for any value in 0 .. 7. */
   isShift = False;
   switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(x86)");
   }

   if (isRotateC) {
      /* call a helper; these insns are so ridiculous they do not
         deserve better */
      Bool     left = toBool(gregOfRM(modrm) == 2);
      IRTemp   r64  = newTemp(Ity_I64);
      /* Helper arguments: value, rotate amount, current eflags, and
         the operand size in bytes. */
      IRExpr** args
         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
                          widenUto32(shift_expr),   /* rotate amount */
                          widenUto32(mk_x86g_calculate_eflags_all()),
                          mkU32(sz) );
      assign( r64, mkIRExprCCall(
                      Ity_I64,
                      0/*regparm*/,
                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
                      args
                   )
            );
      /* new eflags in hi half r64; new value in lo half r64 */
      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
      /* The helper has already produced the final flags, so install
         them verbatim using the COPY thunk. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   }

   if (isShift) {

      IRTemp pre32     = newTemp(Ity_I32);   /* operand widened to 32 bits */
      IRTemp res32     = newTemp(Ity_I32);   /* shifted by shift_amt */
      IRTemp res32ss   = newTemp(Ity_I32);   /* shifted by one place fewer */
      IRTemp shift_amt = newTemp(Ity_I8);
      IROp   op32;

      /* Sub-opcodes 4 and 6 both select a left shift. */
      switch (gregOfRM(modrm)) {
         case 4: op32 = Iop_Shl32; break;
         case 5: op32 = Iop_Shr32; break;
         case 6: op32 = Iop_Shl32; break;
         case 7: op32 = Iop_Sar32; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 32 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the Intel semantics requires that 8/16-bit
         shifts give defined results for shift values all the way up
         to 31, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 32
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 31, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1. */

      /* shift_amt = shift_expr & 31, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );

      /* suitably widen the value to be shifted to 32 bits. */
      /* Sign-extend only for the arithmetic right shift, so that the
         bits shifted in are copies of the operand's own sign bit. */
      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
                                     : widenUto32(mkexpr(dst0)) );

      /* res32 = pre32 `shift` shift_amt */
      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );

      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
      assign( res32ss,
              binop(op32,
                    mkexpr(pre32),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(31))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res32)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp starts as a size index (0 = byte, 1 = 16-bit, 2 = 32-bit)
         and is later offset by the byte-sized ROL/ROR thunk op. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
      Bool   left      = toBool(gregOfRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);   /* count reduced to operand width */
      IRTemp rot_amt32 = newTemp(Ity_I8);   /* count masked to 0 .. 31 */
      IRTemp oldFlags  = newTemp(Ity_I32);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      /* NOTE(review): when rot_amt is zero, the complementary shift
         below has amount 8*sz - 0, i.e. exactly the word size rather
         than strictly less than it.  Confirm this is acceptable to the
         IR shift semantics relied upon here. */
      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));

      if (ty == Ity_I32)
         assign(rot_amt, mkexpr(rot_amt32));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += X86G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += X86G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_x86g_calculate_eflags_all());

      /* rot_amt32 :: Ity_I8.  We need to convert it to I1. */
      /* The test uses rot_amt32 (the count masked to 5 bits), not
         rot_amt, so an 8- or 16-bit rotate by a nonzero multiple of
         the operand width still updates the thunk. */
      IRTemp rot_amt32b = newTemp(Ity_I1);
      assign(rot_amt32b, binop(Iop_CmpNE8, mkexpr(rot_amt32), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each thunk field is rewritten with its own current value when
         the count is zero, which leaves the flags unchanged. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkU32(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    widenUto32(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkU32(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt32b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I32) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
      /* Trace output is produced by hand rather than via DIP because
         the shift amount may have to be pretty-printed as an IRExpr. */
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   2623 
   2624 
   2625 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2626 static
   2627 UInt dis_Grp8_Imm ( UChar sorb,
   2628                     Bool locked,
   2629                     Int delta, UChar modrm,
   2630                     Int am_sz, Int sz, UInt src_val,
   2631                     Bool* decode_OK )
   2632 {
   2633    /* src_val denotes a d8.
   2634       And delta on entry points at the modrm byte. */
   2635 
   2636    IRType ty     = szToITy(sz);
   2637    IRTemp t2     = newTemp(Ity_I32);
   2638    IRTemp t2m    = newTemp(Ity_I32);
   2639    IRTemp t_addr = IRTemp_INVALID;
   2640    HChar  dis_buf[50];
   2641    UInt   mask;
   2642 
   2643    /* we're optimists :-) */
   2644    *decode_OK = True;
   2645 
   2646    /* Limit src_val -- the bit offset -- to something within a word.
   2647       The Intel docs say that literal offsets larger than a word are
   2648       masked in this way. */
   2649    switch (sz) {
   2650       case 2:  src_val &= 15; break;
   2651       case 4:  src_val &= 31; break;
   2652       default: *decode_OK = False; return delta;
   2653    }
   2654 
   2655    /* Invent a mask suitable for the operation. */
   2656    switch (gregOfRM(modrm)) {
   2657       case 4: /* BT */  mask = 0;               break;
   2658       case 5: /* BTS */ mask = 1 << src_val;    break;
   2659       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2660       case 7: /* BTC */ mask = 1 << src_val;    break;
   2661          /* If this needs to be extended, probably simplest to make a
   2662             new function to handle the other cases (0 .. 3).  The
   2663             Intel docs do however not indicate any use for 0 .. 3, so
   2664             we don't expect this to happen. */
   2665       default: *decode_OK = False; return delta;
   2666    }
   2667 
   2668    /* Fetch the value to be tested and modified into t2, which is
   2669       32-bits wide regardless of sz. */
   2670    if (epartIsReg(modrm)) {
   2671       vassert(am_sz == 1);
   2672       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2673       delta += (am_sz + 1);
   2674       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2675                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2676    } else {
   2677       Int len;
   2678       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2679       delta  += (len+1);
   2680       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2681       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2682                               src_val, dis_buf);
   2683    }
   2684 
   2685    /* Compute the new value into t2m, if non-BT. */
   2686    switch (gregOfRM(modrm)) {
   2687       case 4: /* BT */
   2688          break;
   2689       case 5: /* BTS */
   2690          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2691          break;
   2692       case 6: /* BTR */
   2693          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2694          break;
   2695       case 7: /* BTC */
   2696          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2697          break;
   2698       default:
   2699          /*NOTREACHED*/ /*the previous switch guards this*/
   2700          vassert(0);
   2701    }
   2702 
   2703    /* Write the result back, if non-BT.  If the CAS fails then we
   2704       side-exit from the trace at this point, and so the flag state is
   2705       not affected.  This is of course as required. */
   2706    if (gregOfRM(modrm) != 4 /* BT */) {
   2707       if (epartIsReg(modrm)) {
   2708          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2709       } else {
   2710          if (locked) {
   2711             casLE( mkexpr(t_addr),
   2712                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2713                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2714                    guest_EIP_curr_instr );
   2715          } else {
   2716             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2717          }
   2718       }
   2719    }
   2720 
   2721    /* Copy relevant bit from t2 into the carry flag. */
   2722    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2723    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2724    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2725    stmt( IRStmt_Put(
   2726             OFFB_CC_DEP1,
   2727             binop(Iop_And32,
   2728                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2729                   mkU32(1))
   2730        ));
   2731    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2732       elimination of previous stores to this field work better. */
   2733    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2734 
   2735    return delta;
   2736 }
   2737 
   2738 
   2739 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2740    value in EAX/AX/AL by the given IRTemp, and park the result in
   2741    EDX:EAX/DX:AX/AX.
   2742 */
   2743 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2744                                IRTemp tmp, const HChar* tmp_txt )
   2745 {
   2746    IRType ty = szToITy(sz);
   2747    IRTemp t1 = newTemp(ty);
   2748 
   2749    assign( t1, getIReg(sz, R_EAX) );
   2750 
   2751    switch (ty) {
   2752       case Ity_I32: {
   2753          IRTemp res64   = newTemp(Ity_I64);
   2754          IRTemp resHi   = newTemp(Ity_I32);
   2755          IRTemp resLo   = newTemp(Ity_I32);
   2756          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2757          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2758          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2759          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2760          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2761          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2762          putIReg(4, R_EDX, mkexpr(resHi));
   2763          putIReg(4, R_EAX, mkexpr(resLo));
   2764          break;
   2765       }
   2766       case Ity_I16: {
   2767          IRTemp res32   = newTemp(Ity_I32);
   2768          IRTemp resHi   = newTemp(Ity_I16);
   2769          IRTemp resLo   = newTemp(Ity_I16);
   2770          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2771          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2772          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2773          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2774          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2775          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2776          putIReg(2, R_EDX, mkexpr(resHi));
   2777          putIReg(2, R_EAX, mkexpr(resLo));
   2778          break;
   2779       }
   2780       case Ity_I8: {
   2781          IRTemp res16   = newTemp(Ity_I16);
   2782          IRTemp resHi   = newTemp(Ity_I8);
   2783          IRTemp resLo   = newTemp(Ity_I8);
   2784          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2785          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2786          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2787          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2788          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2789          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2790          putIReg(2, R_EAX, mkexpr(res16));
   2791          break;
   2792       }
   2793       default:
   2794          vpanic("codegen_mulL_A_D(x86)");
   2795    }
   2796    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2797 }
   2798 
   2799 
   2800 /* Group 3 extended opcodes. */
   2801 static
   2802 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2803 {
   2804    UInt    d32;
   2805    UChar   modrm;
   2806    HChar   dis_buf[50];
   2807    Int     len;
   2808    IRTemp  addr;
   2809    IRType  ty = szToITy(sz);
   2810    IRTemp  t1 = newTemp(ty);
   2811    IRTemp dst1, src, dst0;
   2812 
   2813    *decode_OK = True; /* may change this later */
   2814 
   2815    modrm = getIByte(delta);
   2816 
   2817    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2818       /* LOCK prefix only allowed with not and neg subopcodes */
   2819       *decode_OK = False;
   2820       return delta;
   2821    }
   2822 
   2823    if (epartIsReg(modrm)) {
   2824       switch (gregOfRM(modrm)) {
   2825          case 0: { /* TEST */
   2826             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2827             dst1 = newTemp(ty);
   2828             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2829                                getIReg(sz,eregOfRM(modrm)),
   2830                                mkU(ty,d32)));
   2831             setFlags_DEP1( Iop_And8, dst1, ty );
   2832             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2833                                       nameIReg(sz, eregOfRM(modrm)));
   2834             break;
   2835          }
   2836          case 1: /* UNDEFINED */
   2837            /* The Intel docs imply this insn is undefined and binutils
   2838               agrees.  Unfortunately Core 2 will run it (with who
   2839               knows what result?)  sandpile.org reckons it's an alias
   2840               for case 0.  We play safe. */
   2841            *decode_OK = False;
   2842            break;
   2843          case 2: /* NOT */
   2844             delta++;
   2845             putIReg(sz, eregOfRM(modrm),
   2846                         unop(mkSizedOp(ty,Iop_Not8),
   2847                              getIReg(sz, eregOfRM(modrm))));
   2848             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2849             break;
   2850          case 3: /* NEG */
   2851             delta++;
   2852             dst0 = newTemp(ty);
   2853             src  = newTemp(ty);
   2854             dst1 = newTemp(ty);
   2855             assign(dst0, mkU(ty,0));
   2856             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2857             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2858             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2859             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2860             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2861             break;
   2862          case 4: /* MUL (unsigned widening) */
   2863             delta++;
   2864             src = newTemp(ty);
   2865             assign(src, getIReg(sz,eregOfRM(modrm)));
   2866             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2867             break;
   2868          case 5: /* IMUL (signed widening) */
   2869             delta++;
   2870             src = newTemp(ty);
   2871             assign(src, getIReg(sz,eregOfRM(modrm)));
   2872             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2873             break;
   2874          case 6: /* DIV */
   2875             delta++;
   2876             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2877             codegen_div ( sz, t1, False );
   2878             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2879             break;
   2880          case 7: /* IDIV */
   2881             delta++;
   2882             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2883             codegen_div ( sz, t1, True );
   2884             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2885             break;
   2886          default:
   2887             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2888             vpanic("Grp3(x86)");
   2889       }
   2890    } else {
   2891       addr = disAMode ( &len, sorb, delta, dis_buf );
   2892       t1   = newTemp(ty);
   2893       delta += len;
   2894       assign(t1, loadLE(ty,mkexpr(addr)));
   2895       switch (gregOfRM(modrm)) {
   2896          case 0: { /* TEST */
   2897             d32 = getUDisp(sz, delta); delta += sz;
   2898             dst1 = newTemp(ty);
   2899             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2900                                mkexpr(t1), mkU(ty,d32)));
   2901             setFlags_DEP1( Iop_And8, dst1, ty );
   2902             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2903             break;
   2904          }
   2905          case 1: /* UNDEFINED */
   2906            /* See comment above on R case */
   2907            *decode_OK = False;
   2908            break;
   2909          case 2: /* NOT */
   2910             dst1 = newTemp(ty);
   2911             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2912             if (locked) {
   2913                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2914                                     guest_EIP_curr_instr );
   2915             } else {
   2916                storeLE( mkexpr(addr), mkexpr(dst1) );
   2917             }
   2918             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2919             break;
   2920          case 3: /* NEG */
   2921             dst0 = newTemp(ty);
   2922             src  = newTemp(ty);
   2923             dst1 = newTemp(ty);
   2924             assign(dst0, mkU(ty,0));
   2925             assign(src,  mkexpr(t1));
   2926             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2927                                mkexpr(dst0), mkexpr(src)));
   2928             if (locked) {
   2929                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2930                                     guest_EIP_curr_instr );
   2931             } else {
   2932                storeLE( mkexpr(addr), mkexpr(dst1) );
   2933             }
   2934             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2935             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2936             break;
   2937          case 4: /* MUL */
   2938             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2939             break;
   2940          case 5: /* IMUL */
   2941             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2942             break;
   2943          case 6: /* DIV */
   2944             codegen_div ( sz, t1, False );
   2945             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2946             break;
   2947          case 7: /* IDIV */
   2948             codegen_div ( sz, t1, True );
   2949             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2950             break;
   2951          default:
   2952             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2953             vpanic("Grp3(x86)");
   2954       }
   2955    }
   2956    return delta;
   2957 }
   2958 
   2959 
   2960 /* Group 4 extended opcodes. */
   2961 static
   2962 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2963 {
   2964    Int   alen;
   2965    UChar modrm;
   2966    HChar dis_buf[50];
   2967    IRType ty = Ity_I8;
   2968    IRTemp t1 = newTemp(ty);
   2969    IRTemp t2 = newTemp(ty);
   2970 
   2971    *decode_OK = True;
   2972 
   2973    modrm = getIByte(delta);
   2974 
   2975    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2976       /* LOCK prefix only allowed with inc and dec subopcodes */
   2977       *decode_OK = False;
   2978       return delta;
   2979    }
   2980 
   2981    if (epartIsReg(modrm)) {
   2982       assign(t1, getIReg(1, eregOfRM(modrm)));
   2983       switch (gregOfRM(modrm)) {
   2984          case 0: /* INC */
   2985             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2986             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2987             setFlags_INC_DEC( True, t2, ty );
   2988             break;
   2989          case 1: /* DEC */
   2990             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2991             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2992             setFlags_INC_DEC( False, t2, ty );
   2993             break;
   2994          default:
   2995             *decode_OK = False;
   2996             return delta;
   2997       }
   2998       delta++;
   2999       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   3000                       nameIReg(1, eregOfRM(modrm)));
   3001    } else {
   3002       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   3003       assign( t1, loadLE(ty, mkexpr(addr)) );
   3004       switch (gregOfRM(modrm)) {
   3005          case 0: /* INC */
   3006             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   3007             if (locked) {
   3008                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3009                       guest_EIP_curr_instr );
   3010             } else {
   3011                storeLE( mkexpr(addr), mkexpr(t2) );
   3012             }
   3013             setFlags_INC_DEC( True, t2, ty );
   3014             break;
   3015          case 1: /* DEC */
   3016             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   3017             if (locked) {
   3018                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3019                       guest_EIP_curr_instr );
   3020             } else {
   3021                storeLE( mkexpr(addr), mkexpr(t2) );
   3022             }
   3023             setFlags_INC_DEC( False, t2, ty );
   3024             break;
   3025          default:
   3026             *decode_OK = False;
   3027             return delta;
   3028       }
   3029       delta += alen;
   3030       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3031    }
   3032    return delta;
   3033 }
   3034 
   3035 
   3036 /* Group 5 extended opcodes. */
   3037 static
   3038 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3039                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   3040 {
   3041    Int     len;
   3042    UChar   modrm;
   3043    HChar   dis_buf[50];
   3044    IRTemp  addr = IRTemp_INVALID;
   3045    IRType  ty = szToITy(sz);
   3046    IRTemp  t1 = newTemp(ty);
   3047    IRTemp  t2 = IRTemp_INVALID;
   3048 
   3049    *decode_OK = True;
   3050 
   3051    modrm = getIByte(delta);
   3052 
   3053    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3054       /* LOCK prefix only allowed with inc and dec subopcodes */
   3055       *decode_OK = False;
   3056       return delta;
   3057    }
   3058 
   3059    if (epartIsReg(modrm)) {
   3060       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3061       switch (gregOfRM(modrm)) {
   3062          case 0: /* INC */
   3063             vassert(sz == 2 || sz == 4);
   3064             t2 = newTemp(ty);
   3065             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3066                              mkexpr(t1), mkU(ty,1)));
   3067             setFlags_INC_DEC( True, t2, ty );
   3068             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3069             break;
   3070          case 1: /* DEC */
   3071             vassert(sz == 2 || sz == 4);
   3072             t2 = newTemp(ty);
   3073             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3074                              mkexpr(t1), mkU(ty,1)));
   3075             setFlags_INC_DEC( False, t2, ty );
   3076             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3077             break;
   3078          case 2: /* call Ev */
   3079             vassert(sz == 4);
   3080             t2 = newTemp(Ity_I32);
   3081             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3082             putIReg(4, R_ESP, mkexpr(t2));
   3083             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3084             jmp_treg(dres, Ijk_Call, t1);
   3085             vassert(dres->whatNext == Dis_StopHere);
   3086             break;
   3087          case 4: /* jmp Ev */
   3088             vassert(sz == 4);
   3089             jmp_treg(dres, Ijk_Boring, t1);
   3090             vassert(dres->whatNext == Dis_StopHere);
   3091             break;
   3092          case 6: /* PUSH Ev */
   3093             vassert(sz == 4 || sz == 2);
   3094             t2 = newTemp(Ity_I32);
   3095             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3096             putIReg(4, R_ESP, mkexpr(t2) );
   3097             storeLE( mkexpr(t2), mkexpr(t1) );
   3098             break;
   3099          default:
   3100             *decode_OK = False;
   3101             return delta;
   3102       }
   3103       delta++;
   3104       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3105                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3106    } else {
   3107       addr = disAMode ( &len, sorb, delta, dis_buf );
   3108       assign(t1, loadLE(ty,mkexpr(addr)));
   3109       switch (gregOfRM(modrm)) {
   3110          case 0: /* INC */
   3111             t2 = newTemp(ty);
   3112             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3113                              mkexpr(t1), mkU(ty,1)));
   3114             if (locked) {
   3115                casLE( mkexpr(addr),
   3116                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3117             } else {
   3118                storeLE(mkexpr(addr),mkexpr(t2));
   3119             }
   3120             setFlags_INC_DEC( True, t2, ty );
   3121             break;
   3122          case 1: /* DEC */
   3123             t2 = newTemp(ty);
   3124             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3125                              mkexpr(t1), mkU(ty,1)));
   3126             if (locked) {
   3127                casLE( mkexpr(addr),
   3128                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3129             } else {
   3130                storeLE(mkexpr(addr),mkexpr(t2));
   3131             }
   3132             setFlags_INC_DEC( False, t2, ty );
   3133             break;
   3134          case 2: /* call Ev */
   3135             vassert(sz == 4);
   3136             t2 = newTemp(Ity_I32);
   3137             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3138             putIReg(4, R_ESP, mkexpr(t2));
   3139             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3140             jmp_treg(dres, Ijk_Call, t1);
   3141             vassert(dres->whatNext == Dis_StopHere);
   3142             break;
   3143          case 4: /* JMP Ev */
   3144             vassert(sz == 4);
   3145             jmp_treg(dres, Ijk_Boring, t1);
   3146             vassert(dres->whatNext == Dis_StopHere);
   3147             break;
   3148          case 6: /* PUSH Ev */
   3149             vassert(sz == 4 || sz == 2);
   3150             t2 = newTemp(Ity_I32);
   3151             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3152             putIReg(4, R_ESP, mkexpr(t2) );
   3153             storeLE( mkexpr(t2), mkexpr(t1) );
   3154             break;
   3155          default:
   3156             *decode_OK = False;
   3157             return delta;
   3158       }
   3159       delta += len;
   3160       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3161                        nameISize(sz), dis_buf);
   3162    }
   3163    return delta;
   3164 }
   3165 
   3166 
   3167 /*------------------------------------------------------------*/
   3168 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3169 /*------------------------------------------------------------*/
   3170 
   3171 /* Code shared by all the string ops */
   3172 static
   3173 void dis_string_op_increment(Int sz, Int t_inc)
   3174 {
   3175    if (sz == 4 || sz == 2) {
   3176       assign( t_inc,
   3177               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3178                                mkU8(sz/2) ) );
   3179    } else {
   3180       assign( t_inc,
   3181               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3182    }
   3183 }
   3184 
   3185 static
   3186 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3187                     Int sz, const HChar* name, UChar sorb )
   3188 {
   3189    IRTemp t_inc = newTemp(Ity_I32);
   3190    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3191    dis_string_op_increment(sz, t_inc);
   3192    dis_OP( sz, t_inc );
   3193    DIP("%s%c\n", name, nameISize(sz));
   3194 }
   3195 
   3196 static
   3197 void dis_MOVS ( Int sz, IRTemp t_inc )
   3198 {
   3199    IRType ty = szToITy(sz);
   3200    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3201    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3202 
   3203    assign( td, getIReg(4, R_EDI) );
   3204    assign( ts, getIReg(4, R_ESI) );
   3205 
   3206    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3207 
   3208    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3209    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3210 }
   3211 
   3212 static
   3213 void dis_LODS ( Int sz, IRTemp t_inc )
   3214 {
   3215    IRType ty = szToITy(sz);
   3216    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3217 
   3218    assign( ts, getIReg(4, R_ESI) );
   3219 
   3220    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3221 
   3222    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3223 }
   3224 
   3225 static
   3226 void dis_STOS ( Int sz, IRTemp t_inc )
   3227 {
   3228    IRType ty = szToITy(sz);
   3229    IRTemp ta = newTemp(ty);        /* EAX */
   3230    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3231 
   3232    assign( ta, getIReg(sz, R_EAX) );
   3233    assign( td, getIReg(4, R_EDI) );
   3234 
   3235    storeLE( mkexpr(td), mkexpr(ta) );
   3236 
   3237    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3238 }
   3239 
   3240 static
   3241 void dis_CMPS ( Int sz, IRTemp t_inc )
   3242 {
   3243    IRType ty  = szToITy(sz);
   3244    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3245    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3246    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3247    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3248 
   3249    assign( td, getIReg(4, R_EDI) );
   3250    assign( ts, getIReg(4, R_ESI) );
   3251 
   3252    assign( tdv, loadLE(ty,mkexpr(td)) );
   3253    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3254 
   3255    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3256 
   3257    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3258    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3259 }
   3260 
   3261 static
   3262 void dis_SCAS ( Int sz, IRTemp t_inc )
   3263 {
   3264    IRType ty  = szToITy(sz);
   3265    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3266    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3267    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3268 
   3269    assign( ta, getIReg(sz, R_EAX) );
   3270    assign( td, getIReg(4, R_EDI) );
   3271 
   3272    assign( tdv, loadLE(ty,mkexpr(td)) );
   3273    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3274 
   3275    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3276 }
   3277 
   3278 
   3279 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3280    We assume the insn is the last one in the basic block, and so emit a jump
   3281    to the next insn, rather than just falling through. */
   3282 static
   3283 void dis_REP_op ( /*MOD*/DisResult* dres,
   3284                   X86Condcode cond,
   3285                   void (*dis_OP)(Int, IRTemp),
   3286                   Int sz, Addr32 eip, Addr32 eip_next, const HChar* name )
   3287 {
   3288    IRTemp t_inc = newTemp(Ity_I32);
   3289    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3290 
   3291    assign( tc, getIReg(4,R_ECX) );
   3292 
   3293    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3294                       Ijk_Boring,
   3295                       IRConst_U32(eip_next), OFFB_EIP ) );
   3296 
   3297    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3298 
   3299    dis_string_op_increment(sz, t_inc);
   3300    dis_OP (sz, t_inc);
   3301 
   3302    if (cond == X86CondAlways) {
   3303       jmp_lit(dres, Ijk_Boring, eip);
   3304       vassert(dres->whatNext == Dis_StopHere);
   3305    } else {
   3306       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3307                          Ijk_Boring,
   3308                          IRConst_U32(eip), OFFB_EIP ) );
   3309       jmp_lit(dres, Ijk_Boring, eip_next);
   3310       vassert(dres->whatNext == Dis_StopHere);
   3311    }
   3312    DIP("%s%c\n", name, nameISize(sz));
   3313 }
   3314 
   3315 
   3316 /*------------------------------------------------------------*/
   3317 /*--- Arithmetic, etc.                                     ---*/
   3318 /*------------------------------------------------------------*/
   3319 
   3320 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3321 static
   3322 UInt dis_mul_E_G ( UChar       sorb,
   3323                    Int         size,
   3324                    Int         delta0 )
   3325 {
   3326    Int    alen;
   3327    HChar  dis_buf[50];
   3328    UChar  rm = getIByte(delta0);
   3329    IRType ty = szToITy(size);
   3330    IRTemp te = newTemp(ty);
   3331    IRTemp tg = newTemp(ty);
   3332    IRTemp resLo = newTemp(ty);
   3333 
   3334    assign( tg, getIReg(size, gregOfRM(rm)) );
   3335    if (epartIsReg(rm)) {
   3336       assign( te, getIReg(size, eregOfRM(rm)) );
   3337    } else {
   3338       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3339       assign( te, loadLE(ty,mkexpr(addr)) );
   3340    }
   3341 
   3342    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3343 
   3344    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3345 
   3346    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3347 
   3348    if (epartIsReg(rm)) {
   3349       DIP("imul%c %s, %s\n", nameISize(size),
   3350                              nameIReg(size,eregOfRM(rm)),
   3351                              nameIReg(size,gregOfRM(rm)));
   3352       return 1+delta0;
   3353    } else {
   3354       DIP("imul%c %s, %s\n", nameISize(size),
   3355                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3356       return alen+delta0;
   3357    }
   3358 }
   3359 
   3360 
   3361 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3362 static
   3363 UInt dis_imul_I_E_G ( UChar       sorb,
   3364                       Int         size,
   3365                       Int         delta,
   3366                       Int         litsize )
   3367 {
   3368    Int    d32, alen;
   3369    HChar  dis_buf[50];
   3370    UChar  rm = getIByte(delta);
   3371    IRType ty = szToITy(size);
   3372    IRTemp te = newTemp(ty);
   3373    IRTemp tl = newTemp(ty);
   3374    IRTemp resLo = newTemp(ty);
   3375 
   3376    vassert(size == 1 || size == 2 || size == 4);
   3377 
   3378    if (epartIsReg(rm)) {
   3379       assign(te, getIReg(size, eregOfRM(rm)));
   3380       delta++;
   3381    } else {
   3382       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3383       assign(te, loadLE(ty, mkexpr(addr)));
   3384       delta += alen;
   3385    }
   3386    d32 = getSDisp(litsize,delta);
   3387    delta += litsize;
   3388 
   3389    if (size == 1) d32 &= 0xFF;
   3390    if (size == 2) d32 &= 0xFFFF;
   3391 
   3392    assign(tl, mkU(ty,d32));
   3393 
   3394    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3395 
   3396    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3397 
   3398    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3399 
   3400    DIP("imul %d, %s, %s\n", d32,
   3401        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3402        nameIReg(size,gregOfRM(rm)) );
   3403    return delta;
   3404 }
   3405 
   3406 
   3407 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3408    the supplied IRTemp, and return a new IRTemp holding the result.
   3409    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3410    argument is zero, return the number of bits in the word (the
   3411    natural semantics). */
   3412 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3413 {
   3414    vassert(ty == Ity_I32 || ty == Ity_I16);
   3415 
   3416    IRTemp src32 = newTemp(Ity_I32);
   3417    assign(src32, widenUto32( mkexpr(src) ));
   3418 
   3419    IRTemp src32x = newTemp(Ity_I32);
   3420    assign(src32x,
   3421           binop(Iop_Shl32, mkexpr(src32),
   3422                            mkU8(32 - 8 * sizeofIRType(ty))));
   3423 
   3424    // Clz32 has undefined semantics when its input is zero, so
   3425    // special-case around that.
   3426    IRTemp res32 = newTemp(Ity_I32);
   3427    assign(res32,
   3428           IRExpr_ITE(
   3429              binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0)),
   3430              mkU32(8 * sizeofIRType(ty)),
   3431              unop(Iop_Clz32, mkexpr(src32x))
   3432    ));
   3433 
   3434    IRTemp res = newTemp(ty);
   3435    assign(res, narrowTo(ty, mkexpr(res32)));
   3436    return res;
   3437 }
   3438 
   3439 
   3440 /*------------------------------------------------------------*/
   3441 /*---                                                      ---*/
   3442 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3443 /*---                                                      ---*/
   3444 /*------------------------------------------------------------*/
   3445 
   3446 /* --- Helper functions for dealing with the register stack. --- */
   3447 
   3448 /* --- Set the emulation-warning pseudo-register. --- */
   3449 
   3450 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3451 {
   3452    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3453    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   3454 }
   3455 
   3456 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3457 
   3458 static IRExpr* mkQNaN64 ( void )
   3459 {
   3460   /* QNaN is 0 2047 1 0(51times)
   3461      == 0b 11111111111b 1 0(51times)
   3462      == 0x7FF8 0000 0000 0000
   3463    */
   3464    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3465 }
   3466 
   3467 /* --------- Get/put the top-of-stack pointer. --------- */
   3468 
   3469 static IRExpr* get_ftop ( void )
   3470 {
   3471    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3472 }
   3473 
   3474 static void put_ftop ( IRExpr* e )
   3475 {
   3476    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3477    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3478 }
   3479 
   3480 /* --------- Get/put the C3210 bits. --------- */
   3481 
   3482 static IRExpr* get_C3210 ( void )
   3483 {
   3484    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3485 }
   3486 
   3487 static void put_C3210 ( IRExpr* e )
   3488 {
   3489    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3490 }
   3491 
   3492 /* --------- Get/put the FPU rounding mode. --------- */
   3493 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3494 {
   3495    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3496 }
   3497 
   3498 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3499 {
   3500    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3501 }
   3502 
   3503 
   3504 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3505 /* Produces a value in 0 .. 3, which is encoded as per the type
   3506    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3507    per IRRoundingMode, we merely need to get it and mask it for
   3508    safety.
   3509 */
   3510 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3511 {
   3512    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3513 }
   3514 
   3515 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3516 {
   3517    return mkU32(Irrm_NEAREST);
   3518 }
   3519 
   3520 
   3521 /* --------- Get/set FP register tag bytes. --------- */
   3522 
   3523 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3524 
   3525 static void put_ST_TAG ( Int i, IRExpr* value )
   3526 {
   3527    IRRegArray* descr;
   3528    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3529    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3530    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3531 }
   3532 
   3533 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3534    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3535 
   3536 static IRExpr* get_ST_TAG ( Int i )
   3537 {
   3538    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3539    return IRExpr_GetI( descr, get_ftop(), i );
   3540 }
   3541 
   3542 
   3543 /* --------- Get/set FP registers. --------- */
   3544 
   3545 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3546    register's tag to indicate the register is full.  The previous
   3547    state of the register is not checked. */
   3548 
   3549 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3550 {
   3551    IRRegArray* descr;
   3552    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3553    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3554    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3555    /* Mark the register as in-use. */
   3556    put_ST_TAG(i, mkU8(1));
   3557 }
   3558 
   3559 /* Given i, and some expression e, emit
   3560       ST(i) = is_full(i) ? NaN : e
   3561    and set the tag accordingly.
   3562 */
   3563 
   3564 static void put_ST ( Int i, IRExpr* value )
   3565 {
   3566    put_ST_UNCHECKED(
   3567       i,
   3568       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3569                   /* non-0 means full */
   3570                   mkQNaN64(),
   3571                   /* 0 means empty */
   3572                   value
   3573       )
   3574    );
   3575 }
   3576 
   3577 
   3578 /* Given i, generate an expression yielding 'ST(i)'. */
   3579 
   3580 static IRExpr* get_ST_UNCHECKED ( Int i )
   3581 {
   3582    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3583    return IRExpr_GetI( descr, get_ftop(), i );
   3584 }
   3585 
   3586 
   3587 /* Given i, generate an expression yielding
   3588   is_full(i) ? ST(i) : NaN
   3589 */
   3590 
   3591 static IRExpr* get_ST ( Int i )
   3592 {
   3593    return
   3594       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   3595                   /* non-0 means full */
   3596                   get_ST_UNCHECKED(i),
   3597                   /* 0 means empty */
   3598                   mkQNaN64());
   3599 }
   3600 
   3601 
   3602 /* Given i, and some expression e, and a condition cond, generate IR
   3603    which has the same effect as put_ST(i,e) when cond is true and has
   3604    no effect when cond is false.  Given the lack of proper
   3605    if-then-else in the IR, this is pretty tricky.
   3606 */
   3607 
   3608 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   3609 {
   3610    // new_tag = if cond then FULL else old_tag
   3611    // new_val = if cond then (if old_tag==FULL then NaN else val)
   3612    //                   else old_val
   3613 
   3614    IRTemp old_tag = newTemp(Ity_I8);
   3615    assign(old_tag, get_ST_TAG(i));
   3616    IRTemp new_tag = newTemp(Ity_I8);
   3617    assign(new_tag,
   3618           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   3619 
   3620    IRTemp old_val = newTemp(Ity_F64);
   3621    assign(old_val, get_ST_UNCHECKED(i));
   3622    IRTemp new_val = newTemp(Ity_F64);
   3623    assign(new_val,
   3624           IRExpr_ITE(mkexpr(cond),
   3625                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   3626                                 /* non-0 means full */
   3627                                 mkQNaN64(),
   3628                                 /* 0 means empty */
   3629                                 value),
   3630                      mkexpr(old_val)));
   3631 
   3632    put_ST_UNCHECKED(i, mkexpr(new_val));
   3633    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   3634    // now set it to new_tag instead.
   3635    put_ST_TAG(i, mkexpr(new_tag));
   3636 }
   3637 
   3638 /* Adjust FTOP downwards by one register. */
   3639 
   3640 static void fp_push ( void )
   3641 {
   3642    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3643 }
   3644 
   3645 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   3646    don't change it. */
   3647 
   3648 static void maybe_fp_push ( IRTemp cond )
   3649 {
   3650    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   3651 }
   3652 
   3653 /* Adjust FTOP upwards by one register, and mark the vacated register
   3654    as empty.  */
   3655 
   3656 static void fp_pop ( void )
   3657 {
   3658    put_ST_TAG(0, mkU8(0));
   3659    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3660 }
   3661 
   3662 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   3663    e[31:1] == 0.
   3664 */
   3665 static void set_C2 ( IRExpr* e )
   3666 {
   3667    IRExpr* cleared = binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2));
   3668    put_C3210( binop(Iop_Or32,
   3669                     cleared,
   3670                     binop(Iop_Shl32, e, mkU8(X86G_FC_SHIFT_C2))) );
   3671 }
   3672 
   3673 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   3674    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   3675    test is simple, but the derivation of it is not so simple.
   3676 
   3677    The exponent field for an IEEE754 double is 11 bits.  That means it
   3678    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   3679    the number is either a NaN or an Infinity and so is not finite.
   3680    Furthermore, a finite value of exactly 2^63 is the smallest value
   3681    that has exponent value 0x43E.  Hence, what we need to do is
   3682    extract the exponent, ignoring the sign bit and mantissa, and check
   3683    it is < 0x43E, or <= 0x43D.
   3684 
   3685    To make this easily applicable to 32- and 64-bit targets, a
   3686    roundabout approach is used.  First the number is converted to I64,
   3687    then the top 32 bits are taken.  Shifting them right by 20 bits
   3688    places the sign bit and exponent in the bottom 12 bits.  Anding
   3689    with 0x7FF gets rid of the sign bit, leaving just the exponent
   3690    available for comparison.
   3691 */
   3692 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   3693 {
   3694    IRTemp i64 = newTemp(Ity_I64);
   3695    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   3696    IRTemp exponent = newTemp(Ity_I32);
   3697    assign(exponent,
   3698           binop(Iop_And32,
   3699                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   3700                 mkU32(0x7FF)));
   3701    IRTemp in_range_and_finite = newTemp(Ity_I1);
   3702    assign(in_range_and_finite,
   3703           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   3704    return in_range_and_finite;
   3705 }
   3706 
   3707 /* Invent a plausible-looking FPU status word value:
   3708       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3709  */
   3710 static IRExpr* get_FPU_sw ( void )
   3711 {
   3712    return
   3713       unop(Iop_32to16,
   3714            binop(Iop_Or32,
   3715                  binop(Iop_Shl32,
   3716                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3717                              mkU8(11)),
   3718                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3719       ));
   3720 }
   3721 
   3722 
   3723 /* ------------------------------------------------------- */
   3724 /* Given all that stack-mangling junk, we can now go ahead
   3725    and describe FP instructions.
   3726 */
   3727 
   3728 /* ST(0) = ST(0) `op` mem64/32(addr)
   3729    Need to check ST(0)'s tag on read, but not on write.
   3730 */
   3731 static
   3732 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3733                          IROp op, Bool dbl )
   3734 {
   3735    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3736    if (dbl) {
   3737       put_ST_UNCHECKED(0,
   3738          triop( op,
   3739                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3740                 get_ST(0),
   3741                 loadLE(Ity_F64,mkexpr(addr))
   3742          ));
   3743    } else {
   3744       put_ST_UNCHECKED(0,
   3745          triop( op,
   3746                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3747                 get_ST(0),
   3748                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3749          ));
   3750    }
   3751 }
   3752 
   3753 
   3754 /* ST(0) = mem64/32(addr) `op` ST(0)
   3755    Need to check ST(0)'s tag on read, but not on write.
   3756 */
   3757 static
   3758 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   3759                             IROp op, Bool dbl )
   3760 {
   3761    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3762    if (dbl) {
   3763       put_ST_UNCHECKED(0,
   3764          triop( op,
   3765                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3766                 loadLE(Ity_F64,mkexpr(addr)),
   3767                 get_ST(0)
   3768          ));
   3769    } else {
   3770       put_ST_UNCHECKED(0,
   3771          triop( op,
   3772                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3773                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3774                 get_ST(0)
   3775          ));
   3776    }
   3777 }
   3778 
   3779 
   3780 /* ST(dst) = ST(dst) `op` ST(src).
   3781    Check dst and src tags when reading but not on write.
   3782 */
   3783 static
   3784 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3785                       Bool pop_after )
   3786 {
   3787    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3788                                  (Int)st_src, (Int)st_dst );
   3789    put_ST_UNCHECKED(
   3790       st_dst,
   3791       triop( op,
   3792              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3793              get_ST(st_dst),
   3794              get_ST(st_src) )
   3795    );
   3796    if (pop_after)
   3797       fp_pop();
   3798 }
   3799 
   3800 /* ST(dst) = ST(src) `op` ST(dst).
   3801    Check dst and src tags when reading but not on write.
   3802 */
   3803 static
   3804 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src,
   3805                          UInt st_dst, Bool pop_after )
   3806 {
   3807    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3808                                  (Int)st_src, (Int)st_dst );
   3809    put_ST_UNCHECKED(
   3810       st_dst,
   3811       triop( op,
   3812              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3813              get_ST(st_src),
   3814              get_ST(st_dst) )
   3815    );
   3816    if (pop_after)
   3817       fp_pop();
   3818 }
   3819 
   3820 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3821 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3822 {
   3823    DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
   3824    /* This is a bit of a hack (and isn't really right).  It sets
   3825       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3826       documentation implies A and S are unchanged.
   3827    */
   3828    /* It's also fishy in that it is used both for COMIP and
   3829       UCOMIP, and they aren't the same (although similar). */
   3830    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3831    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3832    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3833                      binop( Iop_And32,
   3834                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3835                             mkU32(0x45)
   3836        )));
   3837    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3838       elimination of previous stores to this field work better. */
   3839    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3840    if (pop_after)
   3841       fp_pop();
   3842 }
   3843 
   3844 
   3845 static
   3846 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3847 {
   3848    Int    len;
   3849    UInt   r_src, r_dst;
   3850    HChar  dis_buf[50];
   3851    IRTemp t1, t2;
   3852 
   3853    /* On entry, delta points at the second byte of the insn (the modrm
   3854       byte).*/
   3855    UChar first_opcode = getIByte(delta-1);
   3856    UChar modrm        = getIByte(delta+0);
   3857 
   3858    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3859 
   3860    if (first_opcode == 0xD8) {
   3861       if (modrm < 0xC0) {
   3862 
   3863          /* bits 5,4,3 are an opcode extension, and the modRM also
   3864            specifies an address. */
   3865          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3866          delta += len;
   3867 
   3868          switch (gregOfRM(modrm)) {
   3869 
   3870             case 0: /* FADD single-real */
   3871                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3872                break;
   3873 
   3874             case 1: /* FMUL single-real */
   3875                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3876                break;
   3877 
   3878             case 2: /* FCOM single-real */
   3879                DIP("fcoms %s\n", dis_buf);
   3880                /* This forces C1 to zero, which isn't right. */
   3881                put_C3210(
   3882                    binop( Iop_And32,
   3883                           binop(Iop_Shl32,
   3884                                 binop(Iop_CmpF64,
   3885                                       get_ST(0),
   3886                                       unop(Iop_F32toF64,
   3887                                            loadLE(Ity_F32,mkexpr(addr)))),
   3888                                 mkU8(8)),
   3889                           mkU32(0x4500)
   3890                    ));
   3891                break;
   3892 
   3893             case 3: /* FCOMP single-real */
   3894                DIP("fcomps %s\n", dis_buf);
   3895                /* This forces C1 to zero, which isn't right. */
   3896                put_C3210(
   3897                    binop( Iop_And32,
   3898                           binop(Iop_Shl32,
   3899                                 binop(Iop_CmpF64,
   3900                                       get_ST(0),
   3901                                       unop(Iop_F32toF64,
   3902                                            loadLE(Ity_F32,mkexpr(addr)))),
   3903                                 mkU8(8)),
   3904                           mkU32(0x4500)
   3905                    ));
   3906                fp_pop();
   3907                break;
   3908 
   3909             case 4: /* FSUB single-real */
   3910                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3911                break;
   3912 
   3913             case 5: /* FSUBR single-real */
   3914                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3915                break;
   3916 
   3917             case 6: /* FDIV single-real */
   3918                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3919                break;
   3920 
   3921             case 7: /* FDIVR single-real */
   3922                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3923                break;
   3924 
   3925             default:
   3926                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   3927                vex_printf("first_opcode == 0xD8\n");
   3928                goto decode_fail;
   3929          }
   3930       } else {
   3931          delta++;
   3932          switch (modrm) {
   3933 
   3934             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3935                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3936                break;
   3937 
   3938             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3939                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3940                break;
   3941 
   3942             /* Dunno if this is right */
   3943             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3944                r_dst = (UInt)modrm - 0xD0;
   3945                DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
   3946                /* This forces C1 to zero, which isn't right. */
   3947                put_C3210(
   3948                    binop( Iop_And32,
   3949                           binop(Iop_Shl32,
   3950                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3951                                 mkU8(8)),
   3952                           mkU32(0x4500)
   3953                    ));
   3954                break;
   3955 
   3956             /* Dunno if this is right */
   3957             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3958                r_dst = (UInt)modrm - 0xD8;
   3959                DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
   3960                /* This forces C1 to zero, which isn't right. */
   3961                put_C3210(
   3962                    binop( Iop_And32,
   3963                           binop(Iop_Shl32,
   3964                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3965                                 mkU8(8)),
   3966                           mkU32(0x4500)
   3967                    ));
   3968                fp_pop();
   3969                break;
   3970 
   3971             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3972                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3973                break;
   3974 
   3975             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3976                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3977                break;
   3978 
   3979             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3980                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3981                break;
   3982 
   3983             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3984                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3985                break;
   3986 
   3987             default:
   3988                goto decode_fail;
   3989          }
   3990       }
   3991    }
   3992 
   3993    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3994    else
   3995    if (first_opcode == 0xD9) {
   3996       if (modrm < 0xC0) {
   3997 
   3998          /* bits 5,4,3 are an opcode extension, and the modRM also
   3999             specifies an address. */
   4000          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4001          delta += len;
   4002 
   4003          switch (gregOfRM(modrm)) {
   4004 
   4005             case 0: /* FLD single-real */
   4006                DIP("flds %s\n", dis_buf);
   4007                fp_push();
   4008                put_ST(0, unop(Iop_F32toF64,
   4009                               loadLE(Ity_F32, mkexpr(addr))));
   4010                break;
   4011 
   4012             case 2: /* FST single-real */
   4013                DIP("fsts %s\n", dis_buf);
   4014                storeLE(mkexpr(addr),
   4015                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4016                break;
   4017 
   4018             case 3: /* FSTP single-real */
   4019                DIP("fstps %s\n", dis_buf);
   4020                storeLE(mkexpr(addr),
   4021                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   4022                fp_pop();
   4023                break;
   4024 
   4025             case 4: { /* FLDENV m28 */
   4026                /* Uses dirty helper:
   4027                      VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
   4028                IRTemp   ew = newTemp(Ity_I32);
   4029                IRDirty* d  = unsafeIRDirty_0_N (
   4030                                 0/*regparms*/,
   4031                                 "x86g_dirtyhelper_FLDENV",
   4032                                 &x86g_dirtyhelper_FLDENV,
   4033                                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   4034                              );
   4035                d->tmp   = ew;
   4036                /* declare we're reading memory */
   4037                d->mFx   = Ifx_Read;
   4038                d->mAddr = mkexpr(addr);
   4039                d->mSize = 28;
   4040 
   4041                /* declare we're writing guest state */
   4042                d->nFxState = 4;
   4043                vex_bzero(&d->fxState, sizeof(d->fxState));
   4044 
   4045                d->fxState[0].fx     = Ifx_Write;
   4046                d->fxState[0].offset = OFFB_FTOP;
   4047                d->fxState[0].size   = sizeof(UInt);
   4048 
   4049                d->fxState[1].fx     = Ifx_Write;
   4050                d->fxState[1].offset = OFFB_FPTAGS;
   4051                d->fxState[1].size   = 8 * sizeof(UChar);
   4052 
   4053                d->fxState[2].fx     = Ifx_Write;
   4054                d->fxState[2].offset = OFFB_FPROUND;
   4055                d->fxState[2].size   = sizeof(UInt);
   4056 
   4057                d->fxState[3].fx     = Ifx_Write;
   4058                d->fxState[3].offset = OFFB_FC3210;
   4059                d->fxState[3].size   = sizeof(UInt);
   4060 
   4061                stmt( IRStmt_Dirty(d) );
   4062 
   4063                /* ew contains any emulation warning we may need to
   4064                   issue.  If needed, side-exit to the next insn,
   4065                   reporting the warning, so that Valgrind's dispatcher
   4066                   sees the warning. */
   4067                put_emwarn( mkexpr(ew) );
   4068                stmt(
   4069                   IRStmt_Exit(
   4070                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4071                      Ijk_EmWarn,
   4072                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4073                      OFFB_EIP
   4074                   )
   4075                );
   4076 
   4077                DIP("fldenv %s\n", dis_buf);
   4078                break;
   4079             }
   4080 
   4081             case 5: {/* FLDCW */
   4082                /* The only thing we observe in the control word is the
   4083                   rounding mode.  Therefore, pass the 16-bit value
   4084                   (x87 native-format control word) to a clean helper,
   4085                   getting back a 64-bit value, the lower half of which
   4086                   is the FPROUND value to store, and the upper half of
   4087                   which is the emulation-warning token which may be
   4088                   generated.
   4089                */
   4090                /* ULong x86g_check_fldcw ( UInt ); */
   4091                IRTemp t64 = newTemp(Ity_I64);
   4092                IRTemp ew = newTemp(Ity_I32);
   4093                DIP("fldcw %s\n", dis_buf);
   4094                assign( t64, mkIRExprCCall(
   4095                                Ity_I64, 0/*regparms*/,
   4096                                "x86g_check_fldcw",
   4097                                &x86g_check_fldcw,
   4098                                mkIRExprVec_1(
   4099                                   unop( Iop_16Uto32,
   4100                                         loadLE(Ity_I16, mkexpr(addr)))
   4101                                )
   4102                             )
   4103                      );
   4104 
   4105                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   4106                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   4107                put_emwarn( mkexpr(ew) );
   4108                /* Finally, if an emulation warning was reported,
   4109                   side-exit to the next insn, reporting the warning,
   4110                   so that Valgrind's dispatcher sees the warning. */
   4111                stmt(
   4112                   IRStmt_Exit(
   4113                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4114                      Ijk_EmWarn,
   4115                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4116                      OFFB_EIP
   4117                   )
   4118                );
   4119                break;
   4120             }
   4121 
   4122             case 6: { /* FNSTENV m28 */
   4123                /* Uses dirty helper:
   4124                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4125                IRDirty* d = unsafeIRDirty_0_N (
   4126                                0/*regparms*/,
   4127                                "x86g_dirtyhelper_FSTENV",
   4128                                &x86g_dirtyhelper_FSTENV,
   4129                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   4130                             );
   4131                /* declare we're writing memory */
   4132                d->mFx   = Ifx_Write;
   4133                d->mAddr = mkexpr(addr);
   4134                d->mSize = 28;
   4135 
   4136                /* declare we're reading guest state */
   4137                d->nFxState = 4;
   4138                vex_bzero(&d->fxState, sizeof(d->fxState));
   4139 
   4140                d->fxState[0].fx     = Ifx_Read;
   4141                d->fxState[0].offset = OFFB_FTOP;
   4142                d->fxState[0].size   = sizeof(UInt);
   4143 
   4144                d->fxState[1].fx     = Ifx_Read;
   4145                d->fxState[1].offset = OFFB_FPTAGS;
   4146                d->fxState[1].size   = 8 * sizeof(UChar);
   4147 
   4148                d->fxState[2].fx     = Ifx_Read;
   4149                d->fxState[2].offset = OFFB_FPROUND;
   4150                d->fxState[2].size   = sizeof(UInt);
   4151 
   4152                d->fxState[3].fx     = Ifx_Read;
   4153                d->fxState[3].offset = OFFB_FC3210;
   4154                d->fxState[3].size   = sizeof(UInt);
   4155 
   4156                stmt( IRStmt_Dirty(d) );
   4157 
   4158                DIP("fnstenv %s\n", dis_buf);
   4159                break;
   4160             }
   4161 
   4162             case 7: /* FNSTCW */
   4163               /* Fake up a native x87 FPU control word.  The only
   4164                  thing it depends on is FPROUND[1:0], so call a clean
   4165                  helper to cook it up. */
   4166                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4167                DIP("fnstcw %s\n", dis_buf);
   4168                storeLE(
   4169                   mkexpr(addr),
   4170                   unop( Iop_32to16,
   4171                         mkIRExprCCall(
   4172                            Ity_I32, 0/*regp*/,
   4173                            "x86g_create_fpucw", &x86g_create_fpucw,
   4174                            mkIRExprVec_1( get_fpround() )
   4175                         )
   4176                   )
   4177                );
   4178                break;
   4179 
   4180             default:
   4181                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4182                vex_printf("first_opcode == 0xD9\n");
   4183                goto decode_fail;
   4184          }
   4185 
   4186       } else {
   4187          delta++;
   4188          switch (modrm) {
   4189 
   4190             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4191                r_src = (UInt)modrm - 0xC0;
   4192                DIP("fld %%st(%d)\n", (Int)r_src);
   4193                t1 = newTemp(Ity_F64);
   4194                assign(t1, get_ST(r_src));
   4195                fp_push();
   4196                put_ST(0, mkexpr(t1));
   4197                break;
   4198 
   4199             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4200                r_src = (UInt)modrm - 0xC8;
   4201                DIP("fxch %%st(%d)\n", (Int)r_src);
   4202                t1 = newTemp(Ity_F64);
   4203                t2 = newTemp(Ity_F64);
   4204                assign(t1, get_ST(0));
   4205                assign(t2, get_ST(r_src));
   4206                put_ST_UNCHECKED(0, mkexpr(t2));
   4207                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4208                break;
   4209 
   4210             case 0xE0: /* FCHS */
   4211                DIP("fchs\n");
   4212                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4213                break;
   4214 
   4215             case 0xE1: /* FABS */
   4216                DIP("fabs\n");
   4217                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4218                break;
   4219 
   4220             case 0xE4: /* FTST */
   4221                DIP("ftst\n");
   4222                /* This forces C1 to zero, which isn't right. */
   4223                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4224                   set to 0 if stack underflow occurred; otherwise, set
   4225                   to 0" which is pretty nonsensical.  I guess it's a
   4226                    typo. */
   4227                put_C3210(
   4228                    binop( Iop_And32,
   4229                           binop(Iop_Shl32,
   4230                                 binop(Iop_CmpF64,
   4231                                       get_ST(0),
   4232                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4233                                 mkU8(8)),
   4234                           mkU32(0x4500)
   4235                    ));
   4236                break;
   4237 
   4238             case 0xE5: { /* FXAM */
   4239                /* This is an interesting one.  It examines %st(0),
   4240                   regardless of whether the tag says it's empty or not.
   4241                   Here, just pass both the tag (in our format) and the
   4242                   value (as a double, actually a ULong) to a helper
   4243                   function. */
   4244                IRExpr** args
   4245                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4246                                    unop(Iop_ReinterpF64asI64,
   4247                                         get_ST_UNCHECKED(0)) );
   4248                put_C3210(mkIRExprCCall(
   4249                             Ity_I32,
   4250                             0/*regparm*/,
   4251                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4252                             args
   4253                         ));
   4254                DIP("fxam\n");
   4255                break;
   4256             }
   4257 
   4258             case 0xE8: /* FLD1 */
   4259                DIP("fld1\n");
   4260                fp_push();
   4261                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4262                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4263                break;
   4264 
   4265             case 0xE9: /* FLDL2T */
   4266                DIP("fldl2t\n");
   4267                fp_push();
   4268                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4269                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4270                break;
   4271 
   4272             case 0xEA: /* FLDL2E */
   4273                DIP("fldl2e\n");
   4274                fp_push();
   4275                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4276                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4277                break;
   4278 
   4279             case 0xEB: /* FLDPI */
   4280                DIP("fldpi\n");
   4281                fp_push();
   4282                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4283                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4284                break;
   4285 
   4286             case 0xEC: /* FLDLG2 */
   4287                DIP("fldlg2\n");
   4288                fp_push();
   4289                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4290                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4291                break;
   4292 
   4293             case 0xED: /* FLDLN2 */
   4294                DIP("fldln2\n");
   4295                fp_push();
   4296                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4297                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4298                break;
   4299 
   4300             case 0xEE: /* FLDZ */
   4301                DIP("fldz\n");
   4302                fp_push();
   4303                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4304                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4305                break;
   4306 
   4307             case 0xF0: /* F2XM1 */
   4308                DIP("f2xm1\n");
   4309                put_ST_UNCHECKED(0,
   4310                   binop(Iop_2xm1F64,
   4311                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4312                         get_ST(0)));
   4313                break;
   4314 
   4315             case 0xF1: /* FYL2X */
   4316                DIP("fyl2x\n");
   4317                put_ST_UNCHECKED(1,
   4318                   triop(Iop_Yl2xF64,
   4319                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4320                         get_ST(1),
   4321                         get_ST(0)));
   4322                fp_pop();
   4323                break;
   4324 
   4325             case 0xF2: { /* FPTAN */
   4326                DIP("fptan\n");
   4327                IRTemp argD = newTemp(Ity_F64);
   4328                assign(argD, get_ST(0));
   4329                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4330                IRTemp resD = newTemp(Ity_F64);
   4331                assign(resD,
   4332                   IRExpr_ITE(
   4333                      mkexpr(argOK),
   4334                      binop(Iop_TanF64,
   4335                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4336                            mkexpr(argD)),
   4337                      mkexpr(argD))
   4338                );
   4339                put_ST_UNCHECKED(0, mkexpr(resD));
   4340                /* Conditionally push 1.0 on the stack, if the arg is
   4341                   in range */
   4342                maybe_fp_push(argOK);
   4343                maybe_put_ST(argOK, 0,
   4344                             IRExpr_Const(IRConst_F64(1.0)));
   4345                set_C2( binop(Iop_Xor32,
   4346                              unop(Iop_1Uto32, mkexpr(argOK)),
   4347                              mkU32(1)) );
   4348                break;
   4349             }
   4350 
   4351             case 0xF3: /* FPATAN */
   4352                DIP("fpatan\n");
   4353                put_ST_UNCHECKED(1,
   4354                   triop(Iop_AtanF64,
   4355                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4356                         get_ST(1),
   4357                         get_ST(0)));
   4358                fp_pop();
   4359                break;
   4360 
   4361             case 0xF4: { /* FXTRACT */
   4362                IRTemp argF = newTemp(Ity_F64);
   4363                IRTemp sigF = newTemp(Ity_F64);
   4364                IRTemp expF = newTemp(Ity_F64);
   4365                IRTemp argI = newTemp(Ity_I64);
   4366                IRTemp sigI = newTemp(Ity_I64);
   4367                IRTemp expI = newTemp(Ity_I64);
   4368                DIP("fxtract\n");
   4369                assign( argF, get_ST(0) );
   4370                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4371                assign( sigI,
   4372                        mkIRExprCCall(
   4373                           Ity_I64, 0/*regparms*/,
   4374                           "x86amd64g_calculate_FXTRACT",
   4375                           &x86amd64g_calculate_FXTRACT,
   4376                           mkIRExprVec_2( mkexpr(argI),
   4377                                          mkIRExpr_HWord(0)/*sig*/ ))
   4378                );
   4379                assign( expI,
   4380                        mkIRExprCCall(
   4381                           Ity_I64, 0/*regparms*/,
   4382                           "x86amd64g_calculate_FXTRACT",
   4383                           &x86amd64g_calculate_FXTRACT,
   4384                           mkIRExprVec_2( mkexpr(argI),
   4385                                          mkIRExpr_HWord(1)/*exp*/ ))
   4386                );
   4387                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4388                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4389                /* exponent */
   4390                put_ST_UNCHECKED(0, mkexpr(expF) );
   4391                fp_push();
   4392                /* significand */
   4393                put_ST(0, mkexpr(sigF) );
   4394                break;
   4395             }
   4396 
   4397             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4398                IRTemp a1 = newTemp(Ity_F64);
   4399                IRTemp a2 = newTemp(Ity_F64);
   4400                DIP("fprem1\n");
   4401                /* Do FPREM1 twice, once to get the remainder, and once
   4402                   to get the C3210 flag values. */
   4403                assign( a1, get_ST(0) );
   4404                assign( a2, get_ST(1) );
   4405                put_ST_UNCHECKED(0,
   4406                   triop(Iop_PRem1F64,
   4407                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4408                         mkexpr(a1),
   4409                         mkexpr(a2)));
   4410                put_C3210(
   4411                   triop(Iop_PRem1C3210F64,
   4412                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4413                         mkexpr(a1),
   4414                         mkexpr(a2)) );
   4415                break;
   4416             }
   4417 
   4418             case 0xF7: /* FINCSTP -- increment the x87 stack-top pointer */
   4419                DIP("fincstp\n");
   4420                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) ); /* only FTOP is written in this case */
   4421                break;
   4422 
   4423             case 0xF8: { /* FPREM -- not IEEE compliant */
   4424                IRTemp a1 = newTemp(Ity_F64);
   4425                IRTemp a2 = newTemp(Ity_F64);
   4426                DIP("fprem\n");
   4427                /* Do FPREM twice, once to get the remainder, and once
   4428                   to get the C3210 flag values. */
   4429                assign( a1, get_ST(0) );
   4430                assign( a2, get_ST(1) );
   4431                put_ST_UNCHECKED(0,
   4432                   triop(Iop_PRemF64,
   4433                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4434                         mkexpr(a1),
   4435                         mkexpr(a2)));
   4436                put_C3210(
   4437                   triop(Iop_PRemC3210F64,
   4438                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4439                         mkexpr(a1),
   4440                         mkexpr(a2)) );
   4441                break;
   4442             }
   4443 
   4444             case 0xF9: /* FYL2XP1 */
   4445                DIP("fyl2xp1\n");
   4446                put_ST_UNCHECKED(1,
   4447                   triop(Iop_Yl2xp1F64,
   4448                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4449                         get_ST(1),
   4450                         get_ST(0)));
   4451                fp_pop();
   4452                break;
   4453 
   4454             case 0xFA: /* FSQRT */
   4455                DIP("fsqrt\n");
   4456                put_ST_UNCHECKED(0,
   4457                   binop(Iop_SqrtF64,
   4458                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4459                         get_ST(0)));
   4460                break;
   4461 
   4462             case 0xFB: { /* FSINCOS */
   4463                DIP("fsincos\n");
   4464                IRTemp argD = newTemp(Ity_F64);
   4465                assign(argD, get_ST(0));
   4466                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4467                IRTemp resD = newTemp(Ity_F64);
   4468                assign(resD,
   4469                   IRExpr_ITE(
   4470                      mkexpr(argOK),
   4471                      binop(Iop_SinF64,
   4472                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4473                            mkexpr(argD)),
   4474                      mkexpr(argD))
   4475                );
   4476                put_ST_UNCHECKED(0, mkexpr(resD));
   4477                /* Conditionally push the cos value on the stack, if
   4478                   the arg is in range */
   4479                maybe_fp_push(argOK);
   4480                maybe_put_ST(argOK, 0,
   4481                   binop(Iop_CosF64,
   4482                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4483                         mkexpr(argD)));
   4484                set_C2( binop(Iop_Xor32,
   4485                              unop(Iop_1Uto32, mkexpr(argOK)),
   4486                              mkU32(1)) );
   4487                break;
   4488             }
   4489 
   4490             case 0xFC: /* FRNDINT */
   4491                DIP("frndint\n");
   4492                put_ST_UNCHECKED(0,
   4493                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4494                break;
   4495 
   4496             case 0xFD: /* FSCALE */
   4497                DIP("fscale\n");
   4498                put_ST_UNCHECKED(0,
   4499                   triop(Iop_ScaleF64,
   4500                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4501                         get_ST(0),
   4502                         get_ST(1)));
   4503                break;
   4504 
   4505             case 0xFE:   /* FSIN */
   4506             case 0xFF: { /* FCOS */
   4507                Bool isSIN = modrm == 0xFE;
   4508                DIP("%s\n", isSIN ? "fsin" : "fcos");
   4509                IRTemp argD = newTemp(Ity_F64);
   4510                assign(argD, get_ST(0));
   4511                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   4512                IRTemp resD = newTemp(Ity_F64);
   4513                assign(resD,
   4514                   IRExpr_ITE(
   4515                      mkexpr(argOK),
   4516                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   4517                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4518                            mkexpr(argD)),
   4519                      mkexpr(argD))
   4520                );
   4521                put_ST_UNCHECKED(0, mkexpr(resD));
   4522                set_C2( binop(Iop_Xor32,
   4523                              unop(Iop_1Uto32, mkexpr(argOK)),
   4524                              mkU32(1)) );
   4525                break;
   4526             }
   4527 
   4528             default:
   4529                goto decode_fail;
   4530          }
   4531       }
   4532    }
   4533 
   4534    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4535    else
   4536    if (first_opcode == 0xDA) {
   4537 
   4538       if (modrm < 0xC0) {
   4539 
   4540          /* bits 5,4,3 are an opcode extension, and the modRM also
   4541             specifies an address. */
   4542          IROp   fop;
   4543          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4544          delta += len;
   4545          switch (gregOfRM(modrm)) {
   4546 
   4547             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4548                DIP("fiaddl %s\n", dis_buf);
   4549                fop = Iop_AddF64;
   4550                goto do_fop_m32;
   4551 
   4552             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4553                DIP("fimull %s\n", dis_buf);
   4554                fop = Iop_MulF64;
   4555                goto do_fop_m32;
   4556 
   4557             case 2: /* FICOM m32int */
   4558                DIP("ficoml %s\n", dis_buf);
   4559                /* This forces C1 to zero, which isn't right. */
   4560                put_C3210(
   4561                    binop( Iop_And32,
   4562                           binop(Iop_Shl32,
   4563                                 binop(Iop_CmpF64,
   4564                                       get_ST(0),
   4565                                       unop(Iop_I32StoF64,
   4566                                            loadLE(Ity_I32,mkexpr(addr)))),
   4567                                 mkU8(8)),
   4568                           mkU32(0x4500)
   4569                    ));
   4570                break;
   4571 
   4572             case 3: /* FICOMP m32int */
   4573                DIP("ficompl %s\n", dis_buf);
   4574                /* This forces C1 to zero, which isn't right. */
   4575                put_C3210(
   4576                    binop( Iop_And32,
   4577                           binop(Iop_Shl32,
   4578                                 binop(Iop_CmpF64,
   4579                                       get_ST(0),
   4580                                       unop(Iop_I32StoF64,
   4581                                            loadLE(Ity_I32,mkexpr(addr)))),
   4582                                 mkU8(8)),
   4583                           mkU32(0x4500)
   4584                    ));
   4585                fp_pop();
   4586                break;
   4587 
   4588             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4589                DIP("fisubl %s\n", dis_buf);
   4590                fop = Iop_SubF64;
   4591                goto do_fop_m32;
   4592 
   4593             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4594                DIP("fisubrl %s\n", dis_buf);
   4595                fop = Iop_SubF64;
   4596                goto do_foprev_m32;
   4597 
   4598             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4599                DIP("fidivl %s\n", dis_buf);
   4600                fop = Iop_DivF64;
   4601                goto do_fop_m32;
   4602 
   4603             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4604                DIP("fidivrl %s\n", dis_buf);
   4605                fop = Iop_DivF64;
   4606                goto do_foprev_m32;
   4607 
   4608             do_fop_m32:
   4609                put_ST_UNCHECKED(0,
   4610                   triop(fop,
   4611                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4612                         get_ST(0),
   4613                         unop(Iop_I32StoF64,
   4614                              loadLE(Ity_I32, mkexpr(addr)))));
   4615                break;
   4616 
   4617             do_foprev_m32:
   4618                put_ST_UNCHECKED(0,
   4619                   triop(fop,
   4620                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4621                         unop(Iop_I32StoF64,
   4622                              loadLE(Ity_I32, mkexpr(addr))),
   4623                         get_ST(0)));
   4624                break;
   4625 
   4626             default:
   4627                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4628                vex_printf("first_opcode == 0xDA\n");
   4629                goto decode_fail;
   4630          }
   4631 
   4632       } else {
   4633 
   4634          delta++;
   4635          switch (modrm) {
   4636 
   4637             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4638                r_src = (UInt)modrm - 0xC0;
   4639                DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
   4640                put_ST_UNCHECKED(0,
   4641                                 IRExpr_ITE(
   4642                                     mk_x86g_calculate_condition(X86CondB),
   4643                                     get_ST(r_src), get_ST(0)) );
   4644                break;
   4645 
   4646             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4647                r_src = (UInt)modrm - 0xC8;
   4648                DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
   4649                put_ST_UNCHECKED(0,
   4650                                 IRExpr_ITE(
   4651                                     mk_x86g_calculate_condition(X86CondZ),
   4652                                     get_ST(r_src), get_ST(0)) );
   4653                break;
   4654 
   4655             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4656                r_src = (UInt)modrm - 0xD0;
   4657                DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
   4658                put_ST_UNCHECKED(0,
   4659                                 IRExpr_ITE(
   4660                                     mk_x86g_calculate_condition(X86CondBE),
   4661                                     get_ST(r_src), get_ST(0)) );
   4662                break;
   4663 
   4664             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4665                r_src = (UInt)modrm - 0xD8;
   4666                DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
   4667                put_ST_UNCHECKED(0,
   4668                                 IRExpr_ITE(
   4669                                     mk_x86g_calculate_condition(X86CondP),
   4670                                     get_ST(r_src), get_ST(0)) );
   4671                break;
   4672 
   4673             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4674                DIP("fucompp %%st(0),%%st(1)\n");
   4675                /* This forces C1 to zero, which isn't right. */
   4676                put_C3210(
   4677                    binop( Iop_And32,
   4678                           binop(Iop_Shl32,
   4679                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4680                                 mkU8(8)),
   4681                           mkU32(0x4500)
   4682                    ));
   4683                fp_pop();
   4684                fp_pop();
   4685                break;
   4686 
   4687             default:
   4688                goto decode_fail;
   4689          }
   4690 
   4691       }
   4692    }
   4693 
   4694    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4695    else
   4696    if (first_opcode == 0xDB) {
   4697       if (modrm < 0xC0) {
   4698 
   4699          /* bits 5,4,3 are an opcode extension, and the modRM also
   4700             specifies an address. */
   4701          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4702          delta += len;
   4703 
   4704          switch (gregOfRM(modrm)) {
   4705 
   4706             case 0: /* FILD m32int */
   4707                DIP("fildl %s\n", dis_buf);
   4708                fp_push();
   4709                put_ST(0, unop(Iop_I32StoF64,
   4710                               loadLE(Ity_I32, mkexpr(addr))));
   4711                break;
   4712 
   4713             case 1: /* FISTTPL m32 (SSE3) */
   4714                DIP("fisttpl %s\n", dis_buf);
   4715                storeLE( mkexpr(addr),
   4716                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4717                fp_pop();
   4718                break;
   4719 
   4720             case 2: /* FIST m32 */
   4721                DIP("fistl %s\n", dis_buf);
   4722                storeLE( mkexpr(addr),
   4723                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4724                break;
   4725 
   4726             case 3: /* FISTP m32 */
   4727                DIP("fistpl %s\n", dis_buf);
   4728                storeLE( mkexpr(addr),
   4729                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4730                fp_pop();
   4731                break;
   4732 
   4733             case 5: { /* FLD extended-real */
   4734                /* Uses dirty helper:
   4735                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4736                   addr holds the address.  First, do a dirty call to
   4737                   get hold of the data. */
   4738                IRTemp   val  = newTemp(Ity_I64);
   4739                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4740 
   4741                IRDirty* d = unsafeIRDirty_1_N (
   4742                                val,
   4743                                0/*regparms*/,
   4744                                "x86g_dirtyhelper_loadF80le",
   4745                                &x86g_dirtyhelper_loadF80le,
   4746                                args
   4747                             );
   4748                /* declare that we're reading memory */
   4749                d->mFx   = Ifx_Read;
   4750                d->mAddr = mkexpr(addr);
   4751                d->mSize = 10;
   4752 
   4753                /* execute the dirty call, dumping the result in val. */
   4754                stmt( IRStmt_Dirty(d) );
   4755                fp_push();
   4756                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4757 
   4758                DIP("fldt %s\n", dis_buf);
   4759                break;
   4760             }
   4761 
   4762             case 7: { /* FSTP extended-real */
   4763                /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
   4764                IRExpr** args
   4765                   = mkIRExprVec_2( mkexpr(addr),
   4766                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4767 
   4768                IRDirty* d = unsafeIRDirty_0_N (
   4769                                0/*regparms*/,
   4770                                "x86g_dirtyhelper_storeF80le",
   4771                                &x86g_dirtyhelper_storeF80le,
   4772                                args
   4773                             );
   4774                /* declare we're writing memory */
   4775                d->mFx   = Ifx_Write;
   4776                d->mAddr = mkexpr(addr);
   4777                d->mSize = 10;
   4778 
   4779                /* execute the dirty call. */
   4780                stmt( IRStmt_Dirty(d) );
   4781                fp_pop();
   4782 
   4783                DIP("fstpt\n %s", dis_buf);
   4784                break;
   4785             }
   4786 
   4787             default:
   4788                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4789                vex_printf("first_opcode == 0xDB\n");
   4790                goto decode_fail;
   4791          }
   4792 
   4793       } else {
   4794 
   4795          delta++;
   4796          switch (modrm) {
   4797 
   4798             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4799                r_src = (UInt)modrm - 0xC0;
   4800                DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
   4801                put_ST_UNCHECKED(0,
   4802                                 IRExpr_ITE(
   4803                                     mk_x86g_calculate_condition(X86CondNB),
   4804                                     get_ST(r_src), get_ST(0)) );
   4805                break;
   4806 
   4807             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4808                r_src = (UInt)modrm - 0xC8;
   4809                DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
   4810                put_ST_UNCHECKED(0,
   4811                                 IRExpr_ITE(
   4812                                     mk_x86g_calculate_condition(X86CondNZ),
   4813                                     get_ST(r_src), get_ST(0)) );
   4814                break;
   4815 
   4816             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4817                r_src = (UInt)modrm - 0xD0;
   4818                DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
   4819                put_ST_UNCHECKED(0,
   4820                                 IRExpr_ITE(
   4821                                     mk_x86g_calculate_condition(X86CondNBE),
   4822                                     get_ST(r_src), get_ST(0)) );
   4823                break;
   4824 
   4825             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4826                r_src = (UInt)modrm - 0xD8;
   4827                DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
   4828                put_ST_UNCHECKED(0,
   4829                                 IRExpr_ITE(
   4830                                     mk_x86g_calculate_condition(X86CondNP),
   4831                                     get_ST(r_src), get_ST(0)) );
   4832                break;
   4833 
   4834             case 0xE2:
   4835                DIP("fnclex\n");
   4836                break;
   4837 
   4838             case 0xE3: {
   4839                /* Uses dirty helper:
   4840                      void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
   4841                IRDirty* d  = unsafeIRDirty_0_N (
   4842                                 0/*regparms*/,
   4843                                 "x86g_dirtyhelper_FINIT",
   4844                                 &x86g_dirtyhelper_FINIT,
   4845                                 mkIRExprVec_1(IRExpr_BBPTR())
   4846                              );
   4847 
   4848                /* declare we're writing guest state */
   4849                d->nFxState = 5;
   4850                vex_bzero(&d->fxState, sizeof(d->fxState));
   4851 
   4852                d->fxState[0].fx     = Ifx_Write;
   4853                d->fxState[0].offset = OFFB_FTOP;
   4854                d->fxState[0].size   = sizeof(UInt);
   4855 
   4856                d->fxState[1].fx     = Ifx_Write;
   4857                d->fxState[1].offset = OFFB_FPREGS;
   4858                d->fxState[1].size   = 8 * sizeof(ULong);
   4859 
   4860                d->fxState[2].fx     = Ifx_Write;
   4861                d->fxState[2].offset = OFFB_FPTAGS;
   4862                d->fxState[2].size   = 8 * sizeof(UChar);
   4863 
   4864                d->fxState[3].fx     = Ifx_Write;
   4865                d->fxState[3].offset = OFFB_FPROUND;
   4866                d->fxState[3].size   = sizeof(UInt);
   4867 
   4868                d->fxState[4].fx     = Ifx_Write;
   4869                d->fxState[4].offset = OFFB_FC3210;
   4870                d->fxState[4].size   = sizeof(UInt);
   4871 
   4872                stmt( IRStmt_Dirty(d) );
   4873 
   4874                DIP("fninit\n");
   4875                break;
   4876             }
   4877 
   4878             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4879                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4880                break;
   4881 
   4882             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4883                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4884                break;
   4885 
   4886             default:
   4887                goto decode_fail;
   4888          }
   4889       }
   4890    }
   4891 
   4892    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4893    else
   4894    if (first_opcode == 0xDC) {
   4895       if (modrm < 0xC0) {
   4896 
   4897          /* bits 5,4,3 are an opcode extension, and the modRM also
   4898             specifies an address. */
   4899          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4900          delta += len;
   4901 
   4902          switch (gregOfRM(modrm)) {
   4903 
   4904             case 0: /* FADD double-real */
   4905                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4906                break;
   4907 
   4908             case 1: /* FMUL double-real */
   4909                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4910                break;
   4911 
   4912             case 2: /* FCOM double-real */
   4913                DIP("fcoml %s\n", dis_buf);
   4914                /* This forces C1 to zero, which isn't right. */
   4915                put_C3210(
   4916                    binop( Iop_And32,
   4917                           binop(Iop_Shl32,
   4918                                 binop(Iop_CmpF64,
   4919                                       get_ST(0),
   4920                                       loadLE(Ity_F64,mkexpr(addr))),
   4921                                 mkU8(8)),
   4922                           mkU32(0x4500)
   4923                    ));
   4924                break;
   4925 
   4926             case 3: /* FCOMP double-real */
   4927                DIP("fcompl %s\n", dis_buf);
   4928                /* This forces C1 to zero, which isn't right. */
   4929                put_C3210(
   4930                    binop( Iop_And32,
   4931                           binop(Iop_Shl32,
   4932                                 binop(Iop_CmpF64,
   4933                                       get_ST(0),
   4934                                       loadLE(Ity_F64,mkexpr(addr))),
   4935                                 mkU8(8)),
   4936                           mkU32(0x4500)
   4937                    ));
   4938                fp_pop();
   4939                break;
   4940 
   4941             case 4: /* FSUB double-real */
   4942                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4943                break;
   4944 
   4945             case 5: /* FSUBR double-real */
   4946                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4947                break;
   4948 
   4949             case 6: /* FDIV double-real */
   4950                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4951                break;
   4952 
   4953             case 7: /* FDIVR double-real */
   4954                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4955                break;
   4956 
   4957             default:
   4958                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4959                vex_printf("first_opcode == 0xDC\n");
   4960                goto decode_fail;
   4961          }
   4962 
   4963       } else {
   4964 
   4965          delta++;
   4966          switch (modrm) {
   4967 
   4968             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4969                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4970                break;
   4971 
   4972             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4973                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4974                break;
   4975 
   4976             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4977                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4978                break;
   4979 
   4980             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4981                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4982                break;
   4983 
   4984             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4985                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4986                break;
   4987 
   4988             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4989                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4990                break;
   4991 
   4992             default:
   4993                goto decode_fail;
   4994          }
   4995 
   4996       }
   4997    }
   4998 
   4999    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   5000    else
   5001    if (first_opcode == 0xDD) {
   5002 
   5003       if (modrm < 0xC0) {
   5004 
   5005          /* bits 5,4,3 are an opcode extension, and the modRM also
   5006             specifies an address. */
   5007          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5008          delta += len;
   5009 
   5010          switch (gregOfRM(modrm)) {
   5011 
   5012             case 0: /* FLD double-real */
   5013                DIP("fldl %s\n", dis_buf);
   5014                fp_push();
   5015                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   5016                break;
   5017 
   5018             case 1: /* FISTTPQ m64 (SSE3) */
   5019                DIP("fistppll %s\n", dis_buf);
   5020                storeLE( mkexpr(addr),
   5021                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   5022                fp_pop();
   5023                break;
   5024 
   5025             case 2: /* FST double-real */
   5026                DIP("fstl %s\n", dis_buf);
   5027                storeLE(mkexpr(addr), get_ST(0));
   5028                break;
   5029 
   5030             case 3: /* FSTP double-real */
   5031                DIP("fstpl %s\n", dis_buf);
   5032                storeLE(mkexpr(addr), get_ST(0));
   5033                fp_pop();
   5034                break;
   5035 
   5036             case 4: { /* FRSTOR m108 */
   5037                /* Uses dirty helper:
   5038                      VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
   5039                IRTemp   ew = newTemp(Ity_I32);
   5040                IRDirty* d  = unsafeIRDirty_0_N (
   5041                                 0/*regparms*/,
   5042                                 "x86g_dirtyhelper_FRSTOR",
   5043                                 &x86g_dirtyhelper_FRSTOR,
   5044                                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5045                              );
   5046                d->tmp   = ew;
   5047                /* declare we're reading memory */
   5048                d->mFx   = Ifx_Read;
   5049                d->mAddr = mkexpr(addr);
   5050                d->mSize = 108;
   5051 
   5052                /* declare we're writing guest state */
   5053                d->nFxState = 5;
   5054                vex_bzero(&d->fxState, sizeof(d->fxState));
   5055 
   5056                d->fxState[0].fx     = Ifx_Write;
   5057                d->fxState[0].offset = OFFB_FTOP;
   5058                d->fxState[0].size   = sizeof(UInt);
   5059 
   5060                d->fxState[1].fx     = Ifx_Write;
   5061                d->fxState[1].offset = OFFB_FPREGS;
   5062                d->fxState[1].size   = 8 * sizeof(ULong);
   5063 
   5064                d->fxState[2].fx     = Ifx_Write;
   5065                d->fxState[2].offset = OFFB_FPTAGS;
   5066                d->fxState[2].size   = 8 * sizeof(UChar);
   5067 
   5068                d->fxState[3].fx     = Ifx_Write;
   5069                d->fxState[3].offset = OFFB_FPROUND;
   5070                d->fxState[3].size   = sizeof(UInt);
   5071 
   5072                d->fxState[4].fx     = Ifx_Write;
   5073                d->fxState[4].offset = OFFB_FC3210;
   5074                d->fxState[4].size   = sizeof(UInt);
   5075 
   5076                stmt( IRStmt_Dirty(d) );
   5077 
   5078                /* ew contains any emulation warning we may need to
   5079                   issue.  If needed, side-exit to the next insn,
   5080                   reporting the warning, so that Valgrind's dispatcher
   5081                   sees the warning. */
   5082                put_emwarn( mkexpr(ew) );
   5083                stmt(
   5084                   IRStmt_Exit(
   5085                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5086                      Ijk_EmWarn,
   5087                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   5088                      OFFB_EIP
   5089                   )
   5090                );
   5091 
   5092                DIP("frstor %s\n", dis_buf);
   5093                break;
   5094             }
   5095 
   5096             case 6: { /* FNSAVE m108 */
   5097                /* Uses dirty helper:
   5098                      void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
   5099                IRDirty* d = unsafeIRDirty_0_N (
   5100                                0/*regparms*/,
   5101                                "x86g_dirtyhelper_FSAVE",
   5102                                &x86g_dirtyhelper_FSAVE,
   5103                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5104                             );
   5105                /* declare we're writing memory */
   5106                d->mFx   = Ifx_Write;
   5107                d->mAddr = mkexpr(addr);
   5108                d->mSize = 108;
   5109 
   5110                /* declare we're reading guest state */
   5111                d->nFxState = 5;
   5112                vex_bzero(&d->fxState, sizeof(d->fxState));
   5113 
   5114                d->fxState[0].fx     = Ifx_Read;
   5115                d->fxState[0].offset = OFFB_FTOP;
   5116                d->fxState[0].size   = sizeof(UInt);
   5117 
   5118                d->fxState[1].fx     = Ifx_Read;
   5119                d->fxState[1].offset = OFFB_FPREGS;
   5120                d->fxState[1].size   = 8 * sizeof(ULong);
   5121 
   5122                d->fxState[2].fx     = Ifx_Read;
   5123                d->fxState[2].offset = OFFB_FPTAGS;
   5124                d->fxState[2].size   = 8 * sizeof(UChar);
   5125 
   5126                d->fxState[3].fx     = Ifx_Read;
   5127                d->fxState[3].offset = OFFB_FPROUND;
   5128                d->fxState[3].size   = sizeof(UInt);
   5129 
   5130                d->fxState[4].fx     = Ifx_Read;
   5131                d->fxState[4].offset = OFFB_FC3210;
   5132                d->fxState[4].size   = sizeof(UInt);
   5133 
   5134                stmt( IRStmt_Dirty(d) );
   5135 
   5136                DIP("fnsave %s\n", dis_buf);
   5137                break;
   5138             }
   5139 
   5140             case 7: { /* FNSTSW m16 */
   5141                IRExpr* sw = get_FPU_sw();
   5142                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5143                storeLE( mkexpr(addr), sw );
   5144                DIP("fnstsw %s\n", dis_buf);
   5145                break;
   5146             }
   5147 
   5148             default:
   5149                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5150                vex_printf("first_opcode == 0xDD\n");
   5151                goto decode_fail;
   5152          }
   5153       } else {
   5154          delta++;
   5155          switch (modrm) {
   5156 
   5157             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5158                r_dst = (UInt)modrm - 0xC0;
   5159                DIP("ffree %%st(%d)\n", (Int)r_dst);
   5160                put_ST_TAG ( r_dst, mkU8(0) );
   5161                break;
   5162 
   5163             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5164                r_dst = (UInt)modrm - 0xD0;
   5165                DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
   5166                /* P4 manual says: "If the destination operand is a
   5167                   non-empty register, the invalid-operation exception
   5168                   is not generated."  Hence put_ST_UNCHECKED. */
   5169                put_ST_UNCHECKED(r_dst, get_ST(0));
   5170                break;
   5171 
   5172             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5173                r_dst = (UInt)modrm - 0xD8;
   5174                DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
   5175                /* P4 manual says: "If the destination operand is a
   5176                   non-empty register, the invalid-operation exception
   5177                   is not generated."  Hence put_ST_UNCHECKED. */
   5178                put_ST_UNCHECKED(r_dst, get_ST(0));
   5179                fp_pop();
   5180                break;
   5181 
   5182             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5183                r_dst = (UInt)modrm - 0xE0;
   5184                DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
   5185                /* This forces C1 to zero, which isn't right. */
   5186                put_C3210(
   5187                    binop( Iop_And32,
   5188                           binop(Iop_Shl32,
   5189                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5190                                 mkU8(8)),
   5191                           mkU32(0x4500)
   5192                    ));
   5193                break;
   5194 
   5195             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5196                r_dst = (UInt)modrm - 0xE8;
   5197                DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
   5198                /* This forces C1 to zero, which isn't right. */
   5199                put_C3210(
   5200                    binop( Iop_And32,
   5201                           binop(Iop_Shl32,
   5202                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5203                                 mkU8(8)),
   5204                           mkU32(0x4500)
   5205                    ));
   5206                fp_pop();
   5207                break;
   5208 
   5209             default:
   5210                goto decode_fail;
   5211          }
   5212       }
   5213    }
   5214 
   5215    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5216    else
   5217    if (first_opcode == 0xDE) {
   5218 
   5219       if (modrm < 0xC0) {
   5220 
   5221          /* bits 5,4,3 are an opcode extension, and the modRM also
   5222             specifies an address. */
   5223          IROp   fop;
   5224          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5225          delta += len;
   5226 
   5227          switch (gregOfRM(modrm)) {
   5228 
   5229             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5230                DIP("fiaddw %s\n", dis_buf);
   5231                fop = Iop_AddF64;
   5232                goto do_fop_m16;
   5233 
   5234             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5235                DIP("fimulw %s\n", dis_buf);
   5236                fop = Iop_MulF64;
   5237                goto do_fop_m16;
   5238 
   5239             case 2: /* FICOM m16int */
   5240                DIP("ficomw %s\n", dis_buf);
   5241                /* This forces C1 to zero, which isn't right. */
   5242                put_C3210(
   5243                    binop( Iop_And32,
   5244                           binop(Iop_Shl32,
   5245                                 binop(Iop_CmpF64,
   5246                                       get_ST(0),
   5247                                       unop(Iop_I32StoF64,
   5248                                          unop(Iop_16Sto32,
   5249                                            loadLE(Ity_I16,mkexpr(addr))))),
   5250                                 mkU8(8)),
   5251                           mkU32(0x4500)
   5252                    ));
   5253                break;
   5254 
   5255             case 3: /* FICOMP m16int */
   5256                DIP("ficompw %s\n", dis_buf);
   5257                /* This forces C1 to zero, which isn't right. */
   5258                put_C3210(
   5259                    binop( Iop_And32,
   5260                           binop(Iop_Shl32,
   5261                                 binop(Iop_CmpF64,
   5262                                       get_ST(0),
   5263                                       unop(Iop_I32StoF64,
   5264                                          unop(Iop_16Sto32,
   5265                                               loadLE(Ity_I16,mkexpr(addr))))),
   5266                                 mkU8(8)),
   5267                           mkU32(0x4500)
   5268                    ));
   5269                fp_pop();
   5270                break;
   5271 
   5272             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5273                DIP("fisubw %s\n", dis_buf);
   5274                fop = Iop_SubF64;
   5275                goto do_fop_m16;
   5276 
   5277             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5278                DIP("fisubrw %s\n", dis_buf);
   5279                fop = Iop_SubF64;
   5280                goto do_foprev_m16;
   5281 
   5282             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5283                DIP("fisubw %s\n", dis_buf);
   5284                fop = Iop_DivF64;
   5285                goto do_fop_m16;
   5286 
   5287             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5288                DIP("fidivrw %s\n", dis_buf);
   5289                fop = Iop_DivF64;
   5290                goto do_foprev_m16;
   5291 
   5292             do_fop_m16:
   5293                put_ST_UNCHECKED(0,
   5294                   triop(fop,
   5295                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5296                         get_ST(0),
   5297                         unop(Iop_I32StoF64,
   5298                              unop(Iop_16Sto32,
   5299                                   loadLE(Ity_I16, mkexpr(addr))))));
   5300                break;
   5301 
   5302             do_foprev_m16:
   5303                put_ST_UNCHECKED(0,
   5304                   triop(fop,
   5305                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5306                         unop(Iop_I32StoF64,
   5307                              unop(Iop_16Sto32,
   5308                                   loadLE(Ity_I16, mkexpr(addr)))),
   5309                         get_ST(0)));
   5310                break;
   5311 
   5312             default:
   5313                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5314                vex_printf("first_opcode == 0xDE\n");
   5315                goto decode_fail;
   5316          }
   5317 
   5318       } else {
   5319 
   5320          delta++;
   5321          switch (modrm) {
   5322 
   5323             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5324                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5325                break;
   5326 
   5327             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5328                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5329                break;
   5330 
   5331             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5332                DIP("fuompp %%st(0),%%st(1)\n");
   5333                /* This forces C1 to zero, which isn't right. */
   5334                put_C3210(
   5335                    binop( Iop_And32,
   5336                           binop(Iop_Shl32,
   5337                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5338                                 mkU8(8)),
   5339                           mkU32(0x4500)
   5340                    ));
   5341                fp_pop();
   5342                fp_pop();
   5343                break;
   5344 
   5345             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5346                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5347                break;
   5348 
   5349             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5350                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5351                break;
   5352 
   5353             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5354                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5355                break;
   5356 
   5357             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5358                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5359                break;
   5360 
   5361             default:
   5362                goto decode_fail;
   5363          }
   5364 
   5365       }
   5366    }
   5367 
   5368    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5369    else
   5370    if (first_opcode == 0xDF) {
   5371 
   5372       if (modrm < 0xC0) {
   5373 
   5374          /* bits 5,4,3 are an opcode extension, and the modRM also
   5375             specifies an address. */
   5376          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5377          delta += len;
   5378 
   5379          switch (gregOfRM(modrm)) {
   5380 
   5381             case 0: /* FILD m16int */
   5382                DIP("fildw %s\n", dis_buf);
   5383                fp_push();
   5384                put_ST(0, unop(Iop_I32StoF64,
   5385                               unop(Iop_16Sto32,
   5386                                    loadLE(Ity_I16, mkexpr(addr)))));
   5387                break;
   5388 
   5389             case 1: /* FISTTPS m16 (SSE3) */
   5390                DIP("fisttps %s\n", dis_buf);
   5391                storeLE( mkexpr(addr),
   5392                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5393                fp_pop();
   5394                break;
   5395 
   5396             case 2: /* FIST m16 */
   5397                DIP("fistp %s\n", dis_buf);
   5398                storeLE( mkexpr(addr),
   5399                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5400                break;
   5401 
   5402             case 3: /* FISTP m16 */
   5403                DIP("fistps %s\n", dis_buf);
   5404                storeLE( mkexpr(addr),
   5405                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5406                fp_pop();
   5407                break;
   5408 
   5409             case 5: /* FILD m64 */
   5410                DIP("fildll %s\n", dis_buf);
   5411                fp_push();
   5412                put_ST(0, binop(Iop_I64StoF64,
   5413                                get_roundingmode(),
   5414                                loadLE(Ity_I64, mkexpr(addr))));
   5415                break;
   5416 
   5417             case 7: /* FISTP m64 */
   5418                DIP("fistpll %s\n", dis_buf);
   5419                storeLE( mkexpr(addr),
   5420                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5421                fp_pop();
   5422                break;
   5423 
   5424             default:
   5425                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5426                vex_printf("first_opcode == 0xDF\n");
   5427                goto decode_fail;
   5428          }
   5429 
   5430       } else {
   5431 
   5432          delta++;
   5433          switch (modrm) {
   5434 
   5435             case 0xC0: /* FFREEP %st(0) */
   5436                DIP("ffreep %%st(%d)\n", 0);
   5437                put_ST_TAG ( 0, mkU8(0) );
   5438                fp_pop();
   5439                break;
   5440 
   5441             case 0xE0: /* FNSTSW %ax */
   5442                DIP("fnstsw %%ax\n");
   5443                /* Get the FPU status word value and dump it in %AX. */
   5444                if (0) {
   5445                   /* The obvious thing to do is simply dump the 16-bit
   5446                      status word value in %AX.  However, due to a
   5447                      limitation in Memcheck's origin tracking
   5448                      machinery, this causes Memcheck not to track the
   5449                      origin of any undefinedness into %AH (only into
   5450                      %AL/%AX/%EAX), which means origins are lost in
   5451                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5452                   putIReg(2, R_EAX, get_FPU_sw());
   5453                } else {
   5454                   /* So a somewhat lame kludge is to make it very
   5455                      clear to Memcheck that the value is written to
   5456                      both %AH and %AL.  This generates marginally
   5457                      worse code, but I don't think it matters much. */
   5458                   IRTemp t16 = newTemp(Ity_I16);
   5459                   assign(t16, get_FPU_sw());
   5460                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5461                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5462                }
   5463                break;
   5464 
   5465             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5466                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5467                break;
   5468 
   5469             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5470                /* not really right since COMIP != UCOMIP */
   5471                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5472                break;
   5473 
   5474             default:
   5475                goto decode_fail;
   5476          }
   5477       }
   5478 
   5479    }
   5480 
   5481    else
   5482    vpanic("dis_FPU(x86): invalid primary opcode");
   5483 
   5484    *decode_ok = True;
   5485    return delta;
   5486 
   5487   decode_fail:
   5488    *decode_ok = False;
   5489    return delta;
   5490 }
   5491 
   5492 
   5493 /*------------------------------------------------------------*/
   5494 /*---                                                      ---*/
   5495 /*--- MMX INSTRUCTIONS                                     ---*/
   5496 /*---                                                      ---*/
   5497 /*------------------------------------------------------------*/
   5498 
   5499 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5500    IA32 arch manual, volume 3):
   5501 
   5502    Read from, or write to MMX register (viz, any insn except EMMS):
   5503    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5504    * FP stack pointer set to zero
   5505 
   5506    EMMS:
   5507    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5508    * FP stack pointer set to zero
   5509 */
   5510 
   5511 static void do_MMX_preamble ( void )
   5512 {
   5513    Int         i;
   5514    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5515    IRExpr*     zero  = mkU32(0);
   5516    IRExpr*     tag1  = mkU8(1);
   5517    put_ftop(zero);
   5518    for (i = 0; i < 8; i++)
   5519       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   5520 }
   5521 
   5522 static void do_EMMS_preamble ( void )
   5523 {
   5524    Int         i;
   5525    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5526    IRExpr*     zero  = mkU32(0);
   5527    IRExpr*     tag0  = mkU8(0);
   5528    put_ftop(zero);
   5529    for (i = 0; i < 8; i++)
   5530       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   5531 }
   5532 
   5533 
   5534 static IRExpr* getMMXReg ( UInt archreg )
   5535 {
   5536    vassert(archreg < 8);
   5537    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5538 }
   5539 
   5540 
   5541 static void putMMXReg ( UInt archreg, IRExpr* e )
   5542 {
   5543    vassert(archreg < 8);
   5544    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5545    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5546 }
   5547 
   5548 
   5549 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5550    sense that it does not first call do_MMX_preamble() -- that is the
   5551    responsibility of its caller. */
   5552 
   5553 static
   5554 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5555                                Int    delta,
   5556                                UChar  opc,
   5557                                const HChar* name,
   5558                                Bool   show_granularity )
   5559 {
   5560    HChar   dis_buf[50];
   5561    UChar   modrm = getIByte(delta);
   5562    Bool    isReg = epartIsReg(modrm);
   5563    IRExpr* argL  = NULL;
   5564    IRExpr* argR  = NULL;
   5565    IRExpr* argG  = NULL;
   5566    IRExpr* argE  = NULL;
   5567    IRTemp  res   = newTemp(Ity_I64);
   5568 
   5569    Bool    invG  = False;
   5570    IROp    op    = Iop_INVALID;
   5571    void*   hAddr = NULL;
   5572    Bool    eLeft = False;
   5573    const HChar*  hName = NULL;
   5574 
   5575 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5576 
   5577    switch (opc) {
   5578       /* Original MMX ones */
   5579       case 0xFC: op = Iop_Add8x8; break;
   5580       case 0xFD: op = Iop_Add16x4; break;
   5581       case 0xFE: op = Iop_Add32x2; break;
   5582 
   5583       case 0xEC: op = Iop_QAdd8Sx8; break;
   5584       case 0xED: op = Iop_QAdd16Sx4; break;
   5585 
   5586       case 0xDC: op = Iop_QAdd8Ux8; break;
   5587       case 0xDD: op = Iop_QAdd16Ux4; break;
   5588 
   5589       case 0xF8: op = Iop_Sub8x8;  break;
   5590       case 0xF9: op = Iop_Sub16x4; break;
   5591       case 0xFA: op = Iop_Sub32x2; break;
   5592 
   5593       case 0xE8: op = Iop_QSub8Sx8; break;
   5594       case 0xE9: op = Iop_QSub16Sx4; break;
   5595 
   5596       case 0xD8: op = Iop_QSub8Ux8; break;
   5597       case 0xD9: op = Iop_QSub16Ux4; break;
   5598 
   5599       case 0xE5: op = Iop_MulHi16Sx4; break;
   5600       case 0xD5: op = Iop_Mul16x4; break;
   5601       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5602 
   5603       case 0x74: op = Iop_CmpEQ8x8; break;
   5604       case 0x75: op = Iop_CmpEQ16x4; break;
   5605       case 0x76: op = Iop_CmpEQ32x2; break;
   5606 
   5607       case 0x64: op = Iop_CmpGT8Sx8; break;
   5608       case 0x65: op = Iop_CmpGT16Sx4; break;
   5609       case 0x66: op = Iop_CmpGT32Sx2; break;
   5610 
   5611       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   5612       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   5613       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   5614 
   5615       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5616       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5617       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5618 
   5619       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5620       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5621       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5622 
   5623       case 0xDB: op = Iop_And64; break;
   5624       case 0xDF: op = Iop_And64; invG = True; break;
   5625       case 0xEB: op = Iop_Or64; break;
   5626       case 0xEF: /* Possibly do better here if argL and argR are the
   5627                     same reg */
   5628                  op = Iop_Xor64; break;
   5629 
   5630       /* Introduced in SSE1 */
   5631       case 0xE0: op = Iop_Avg8Ux8;    break;
   5632       case 0xE3: op = Iop_Avg16Ux4;   break;
   5633       case 0xEE: op = Iop_Max16Sx4;   break;
   5634       case 0xDE: op = Iop_Max8Ux8;    break;
   5635       case 0xEA: op = Iop_Min16Sx4;   break;
   5636       case 0xDA: op = Iop_Min8Ux8;    break;
   5637       case 0xE4: op = Iop_MulHi16Ux4; break;
   5638       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5639 
   5640       /* Introduced in SSE2 */
   5641       case 0xD4: op = Iop_Add64; break;
   5642       case 0xFB: op = Iop_Sub64; break;
   5643 
   5644       default:
   5645          vex_printf("\n0x%x\n", (Int)opc);
   5646          vpanic("dis_MMXop_regmem_to_reg");
   5647    }
   5648 
   5649 #  undef XXX
   5650 
   5651    argG = getMMXReg(gregOfRM(modrm));
   5652    if (invG)
   5653       argG = unop(Iop_Not64, argG);
   5654 
   5655    if (isReg) {
   5656       delta++;
   5657       argE = getMMXReg(eregOfRM(modrm));
   5658    } else {
   5659       Int    len;
   5660       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5661       delta += len;
   5662       argE = loadLE(Ity_I64, mkexpr(addr));
   5663    }
   5664 
   5665    if (eLeft) {
   5666       argL = argE;
   5667       argR = argG;
   5668    } else {
   5669       argL = argG;
   5670       argR = argE;
   5671    }
   5672 
   5673    if (op != Iop_INVALID) {
   5674       vassert(hName == NULL);
   5675       vassert(hAddr == NULL);
   5676       assign(res, binop(op, argL, argR));
   5677    } else {
   5678       vassert(hName != NULL);
   5679       vassert(hAddr != NULL);
   5680       assign( res,
   5681               mkIRExprCCall(
   5682                  Ity_I64,
   5683                  0/*regparms*/, hName, hAddr,
   5684                  mkIRExprVec_2( argL, argR )
   5685               )
   5686             );
   5687    }
   5688 
   5689    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5690 
   5691    DIP("%s%s %s, %s\n",
   5692        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5693        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5694        nameMMXReg(gregOfRM(modrm)) );
   5695 
   5696    return delta;
   5697 }
   5698 
   5699 
   5700 /* Vector by scalar shift of G by the amount specified at the bottom
   5701    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5702 
   5703 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5704                                  const HChar* opname, IROp op )
   5705 {
   5706    HChar   dis_buf[50];
   5707    Int     alen, size;
   5708    IRTemp  addr;
   5709    Bool    shl, shr, sar;
   5710    UChar   rm   = getIByte(delta);
   5711    IRTemp  g0   = newTemp(Ity_I64);
   5712    IRTemp  g1   = newTemp(Ity_I64);
   5713    IRTemp  amt  = newTemp(Ity_I32);
   5714    IRTemp  amt8 = newTemp(Ity_I8);
   5715 
   5716    if (epartIsReg(rm)) {
   5717       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5718       DIP("%s %s,%s\n", opname,
   5719                         nameMMXReg(eregOfRM(rm)),
   5720                         nameMMXReg(gregOfRM(rm)) );
   5721       delta++;
   5722    } else {
   5723       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5724       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5725       DIP("%s %s,%s\n", opname,
   5726                         dis_buf,
   5727                         nameMMXReg(gregOfRM(rm)) );
   5728       delta += alen;
   5729    }
   5730    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5731    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5732 
   5733    shl = shr = sar = False;
   5734    size = 0;
   5735    switch (op) {
   5736       case Iop_ShlN16x4: shl = True; size = 32; break;
   5737       case Iop_ShlN32x2: shl = True; size = 32; break;
   5738       case Iop_Shl64:    shl = True; size = 64; break;
   5739       case Iop_ShrN16x4: shr = True; size = 16; break;
   5740       case Iop_ShrN32x2: shr = True; size = 32; break;
   5741       case Iop_Shr64:    shr = True; size = 64; break;
   5742       case Iop_SarN16x4: sar = True; size = 16; break;
   5743       case Iop_SarN32x2: sar = True; size = 32; break;
   5744       default: vassert(0);
   5745    }
   5746 
   5747    if (shl || shr) {
   5748      assign(
   5749         g1,
   5750         IRExpr_ITE(
   5751            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5752            binop(op, mkexpr(g0), mkexpr(amt8)),
   5753            mkU64(0)
   5754         )
   5755      );
   5756    } else
   5757    if (sar) {
   5758      assign(
   5759         g1,
   5760         IRExpr_ITE(
   5761            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   5762            binop(op, mkexpr(g0), mkexpr(amt8)),
   5763            binop(op, mkexpr(g0), mkU8(size-1))
   5764         )
   5765      );
   5766    } else {
   5767       /*NOTREACHED*/
   5768       vassert(0);
   5769    }
   5770 
   5771    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5772    return delta;
   5773 }
   5774 
   5775 
   5776 /* Vector by scalar shift of E by an immediate byte.  This is a
   5777    straight copy of dis_SSE_shiftE_imm. */
   5778 
   5779 static
   5780 UInt dis_MMX_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   5781 {
   5782    Bool    shl, shr, sar;
   5783    UChar   rm   = getIByte(delta);
   5784    IRTemp  e0   = newTemp(Ity_I64);
   5785    IRTemp  e1   = newTemp(Ity_I64);
   5786    UChar   amt, size;
   5787    vassert(epartIsReg(rm));
   5788    vassert(gregOfRM(rm) == 2
   5789            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5790    amt = getIByte(delta+1);
   5791    delta += 2;
   5792    DIP("%s $%d,%s\n", opname,
   5793                       (Int)amt,
   5794                       nameMMXReg(eregOfRM(rm)) );
   5795 
   5796    assign( e0, getMMXReg(eregOfRM(rm)) );
   5797 
   5798    shl = shr = sar = False;
   5799    size = 0;
   5800    switch (op) {
   5801       case Iop_ShlN16x4: shl = True; size = 16; break;
   5802       case Iop_ShlN32x2: shl = True; size = 32; break;
   5803       case Iop_Shl64:    shl = True; size = 64; break;
   5804       case Iop_SarN16x4: sar = True; size = 16; break;
   5805       case Iop_SarN32x2: sar = True; size = 32; break;
   5806       case Iop_ShrN16x4: shr = True; size = 16; break;
   5807       case Iop_ShrN32x2: shr = True; size = 32; break;
   5808       case Iop_Shr64:    shr = True; size = 64; break;
   5809       default: vassert(0);
   5810    }
   5811 
   5812    if (shl || shr) {
   5813       assign( e1, amt >= size
   5814                      ? mkU64(0)
   5815                      : binop(op, mkexpr(e0), mkU8(amt))
   5816       );
   5817    } else
   5818    if (sar) {
   5819       assign( e1, amt >= size
   5820                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5821                      : binop(op, mkexpr(e0), mkU8(amt))
   5822       );
   5823    } else {
   5824       /*NOTREACHED*/
   5825       vassert(0);
   5826    }
   5827 
   5828    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5829    return delta;
   5830 }
   5831 
   5832 
   5833 /* Completely handle all MMX instructions except emms. */
   5834 
   5835 static
   5836 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5837 {
   5838    Int   len;
   5839    UChar modrm;
   5840    HChar dis_buf[50];
   5841    UChar opc = getIByte(delta);
   5842    delta++;
   5843 
   5844    /* dis_MMX handles all insns except emms. */
   5845    do_MMX_preamble();
   5846 
   5847    switch (opc) {
   5848 
   5849       case 0x6E:
   5850          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5851          if (sz != 4)
   5852             goto mmx_decode_failure;
   5853          modrm = getIByte(delta);
   5854          if (epartIsReg(modrm)) {
   5855             delta++;
   5856             putMMXReg(
   5857                gregOfRM(modrm),
   5858                binop( Iop_32HLto64,
   5859                       mkU32(0),
   5860                       getIReg(4, eregOfRM(modrm)) ) );
   5861             DIP("movd %s, %s\n",
   5862                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5863          } else {
   5864             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5865             delta += len;
   5866             putMMXReg(
   5867                gregOfRM(modrm),
   5868                binop( Iop_32HLto64,
   5869                       mkU32(0),
   5870                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5871             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5872          }
   5873          break;
   5874 
   5875       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5876          if (sz != 4)
   5877             goto mmx_decode_failure;
   5878          modrm = getIByte(delta);
   5879          if (epartIsReg(modrm)) {
   5880             delta++;
   5881             putIReg( 4, eregOfRM(modrm),
   5882                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5883             DIP("movd %s, %s\n",
   5884                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5885          } else {
   5886             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5887             delta += len;
   5888             storeLE( mkexpr(addr),
   5889                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5890             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5891          }
   5892          break;
   5893 
   5894       case 0x6F:
   5895          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5896          if (sz != 4)
   5897             goto mmx_decode_failure;
   5898          modrm = getIByte(delta);
   5899          if (epartIsReg(modrm)) {
   5900             delta++;
   5901             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5902             DIP("movq %s, %s\n",
   5903                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5904          } else {
   5905             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5906             delta += len;
   5907             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5908             DIP("movq %s, %s\n",
   5909                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5910          }
   5911          break;
   5912 
   5913       case 0x7F:
   5914          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5915          if (sz != 4)
   5916             goto mmx_decode_failure;
   5917          modrm = getIByte(delta);
   5918          if (epartIsReg(modrm)) {
   5919             delta++;
   5920             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5921             DIP("movq %s, %s\n",
   5922                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5923          } else {
   5924             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5925             delta += len;
   5926             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5927             DIP("mov(nt)q %s, %s\n",
   5928                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5929          }
   5930          break;
   5931 
   5932       case 0xFC:
   5933       case 0xFD:
   5934       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5935          if (sz != 4)
   5936             goto mmx_decode_failure;
   5937          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5938          break;
   5939 
   5940       case 0xEC:
   5941       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5942          if (sz != 4)
   5943             goto mmx_decode_failure;
   5944          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5945          break;
   5946 
   5947       case 0xDC:
   5948       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5949          if (sz != 4)
   5950             goto mmx_decode_failure;
   5951          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5952          break;
   5953 
   5954       case 0xF8:
   5955       case 0xF9:
   5956       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5957          if (sz != 4)
   5958             goto mmx_decode_failure;
   5959          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5960          break;
   5961 
   5962       case 0xE8:
   5963       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5964          if (sz != 4)
   5965             goto mmx_decode_failure;
   5966          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5967          break;
   5968 
   5969       case 0xD8:
   5970       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5971          if (sz != 4)
   5972             goto mmx_decode_failure;
   5973          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5974          break;
   5975 
   5976       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5977          if (sz != 4)
   5978             goto mmx_decode_failure;
   5979          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5980          break;
   5981 
   5982       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5983          if (sz != 4)
   5984             goto mmx_decode_failure;
   5985          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5986          break;
   5987 
   5988       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5989          vassert(sz == 4);
   5990          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5991          break;
   5992 
   5993       case 0x74:
   5994       case 0x75:
   5995       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5996          if (sz != 4)
   5997             goto mmx_decode_failure;
   5998          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   5999          break;
   6000 
   6001       case 0x64:
   6002       case 0x65:
   6003       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   6004          if (sz != 4)
   6005             goto mmx_decode_failure;
   6006          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   6007          break;
   6008 
   6009       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   6010          if (sz != 4)
   6011             goto mmx_decode_failure;
   6012          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   6013          break;
   6014 
   6015       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6016          if (sz != 4)
   6017             goto mmx_decode_failure;
   6018          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   6019          break;
   6020 
   6021       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   6022          if (sz != 4)
   6023             goto mmx_decode_failure;
   6024          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   6025          break;
   6026 
   6027       case 0x68:
   6028       case 0x69:
   6029       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   6030          if (sz != 4)
   6031             goto mmx_decode_failure;
   6032          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   6033          break;
   6034 
   6035       case 0x60:
   6036       case 0x61:
   6037       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6038          if (sz != 4)
   6039             goto mmx_decode_failure;
   6040          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   6041          break;
   6042 
   6043       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   6044          if (sz != 4)
   6045             goto mmx_decode_failure;
   6046          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   6047          break;
   6048 
   6049       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   6050          if (sz != 4)
   6051             goto mmx_decode_failure;
   6052          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   6053          break;
   6054 
   6055       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   6056          if (sz != 4)
   6057             goto mmx_decode_failure;
   6058          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   6059          break;
   6060 
   6061       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   6062          if (sz != 4)
   6063             goto mmx_decode_failure;
   6064          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   6065          break;
   6066 
   6067 #     define SHIFT_BY_REG(_name,_op)                                 \
   6068                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   6069                 break;
   6070 
   6071       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6072       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   6073       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   6074       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   6075 
   6076       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   6077       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   6078       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   6079       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   6080 
   6081       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   6082       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   6083       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   6084 
   6085 #     undef SHIFT_BY_REG
   6086 
   6087       case 0x71:
   6088       case 0x72:
   6089       case 0x73: {
   6090          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   6091          UChar byte2, subopc;
   6092          if (sz != 4)
   6093             goto mmx_decode_failure;
   6094          byte2  = getIByte(delta);           /* amode / sub-opcode */
   6095          subopc = toUChar( (byte2 >> 3) & 7 );
   6096 
   6097 #        define SHIFT_BY_IMM(_name,_op)                         \
   6098              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   6099              } while (0)
   6100 
   6101               if (subopc == 2 /*SRL*/ && opc == 0x71)
   6102                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   6103          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   6104                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   6105          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   6106                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   6107 
   6108          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   6109                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   6110          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   6111                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   6112 
   6113          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   6114                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   6115          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   6116                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   6117          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   6118                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   6119 
   6120          else goto mmx_decode_failure;
   6121 
   6122 #        undef SHIFT_BY_IMM
   6123          break;
   6124       }
   6125 
   6126       case 0xF7: {
   6127          IRTemp addr    = newTemp(Ity_I32);
   6128          IRTemp regD    = newTemp(Ity_I64);
   6129          IRTemp regM    = newTemp(Ity_I64);
   6130          IRTemp mask    = newTemp(Ity_I64);
   6131          IRTemp olddata = newTemp(Ity_I64);
   6132          IRTemp newdata = newTemp(Ity_I64);
   6133 
   6134          modrm = getIByte(delta);
   6135          if (sz != 4 || (!epartIsReg(modrm)))
   6136             goto mmx_decode_failure;
   6137          delta++;
   6138 
   6139          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6140          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6141          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6142          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6143          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6144          assign( newdata,
   6145                  binop(Iop_Or64,
   6146                        binop(Iop_And64,
   6147                              mkexpr(regD),
   6148                              mkexpr(mask) ),
   6149                        binop(Iop_And64,
   6150                              mkexpr(olddata),
   6151                              unop(Iop_Not64, mkexpr(mask)))) );
   6152          storeLE( mkexpr(addr), mkexpr(newdata) );
   6153          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6154                                  nameMMXReg( gregOfRM(modrm) ) );
   6155          break;
   6156       }
   6157 
   6158       /* --- MMX decode failure --- */
   6159       default:
   6160       mmx_decode_failure:
   6161          *decode_ok = False;
   6162          return delta; /* ignored */
   6163 
   6164    }
   6165 
   6166    *decode_ok = True;
   6167    return delta;
   6168 }
   6169 
   6170 
   6171 /*------------------------------------------------------------*/
   6172 /*--- More misc arithmetic and other obscure insns.        ---*/
   6173 /*------------------------------------------------------------*/
   6174 
   6175 /* Double length left and right shifts.  Apparently only required in
   6176    v-size (no b- variant). */
        /* Emits IR for one shld/shrd-style instruction whose modrm byte is
           at 'delta'.  'sz' must be 2 or 4 (asserted below).  The shift
           amount is masked to 0..31 (sz == 4) or 0..15 (sz == 2) before it
           is used.  'left_shift' selects the direction.  The result is
           written back to the E operand (register or memory) and the flag
           thunk is set.  Returns delta advanced past the modrm/amode bytes,
           plus one further byte when 'amt_is_literal' is True. */
   6177 static
   6178 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6179                        Int delta, UChar modrm,
   6180                        Int sz,
   6181                        IRExpr* shift_amt,
   6182                        Bool amt_is_literal,
   6183                        const HChar* shift_amt_txt,
   6184                        Bool left_shift )
   6185 {
   6186    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6187       for printing it.   And eip on entry points at the modrm byte. */
   6188    Int len;
   6189    HChar dis_buf[50];
   6190 
   6191    IRType ty       = szToITy(sz);
   6192    IRTemp gsrc     = newTemp(ty);
   6193    IRTemp esrc     = newTemp(ty);
   6194    IRTemp addr     = IRTemp_INVALID;
   6195    IRTemp tmpSH    = newTemp(Ity_I8);
   6196    IRTemp tmpL     = IRTemp_INVALID;
   6197    IRTemp tmpRes   = IRTemp_INVALID;
   6198    IRTemp tmpSubSh = IRTemp_INVALID;
   6199    IROp   mkpair;
   6200    IROp   getres;
   6201    IROp   shift;
   6202    IRExpr* mask = NULL;
   6203 
   6204    vassert(sz == 2 || sz == 4);
   6205 
   6206    /* The E-part is the destination; this is shifted.  The G-part
   6207       supplies bits to be shifted into the E-part, but is not
   6208       changed.
   6209 
   6210       If shifting left, form a double-length word with E at the top
   6211       and G at the bottom, and shift this left.  The result is then in
   6212       the high part.
   6213 
   6214       If shifting right, form a double-length word with G at the top
   6215       and E at the bottom, and shift this right.  The result is then
   6216       at the bottom.  */
   6217 
   6218    /* Fetch the operands. */
   6219 
   6220    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6221 
   6222    if (epartIsReg(modrm)) {
   6223       delta++;
   6224       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6225       DIP("sh%cd%c %s, %s, %s\n",
   6226           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6227           shift_amt_txt,
   6228           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6229    } else {
   6230       addr = disAMode ( &len, sorb, delta, dis_buf );
   6231       delta += len;
   6232       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6233       DIP("sh%cd%c %s, %s, %s\n",
   6234           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6235           shift_amt_txt,
   6236           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6237    }
   6238 
   6239    /* Round up the relevant primops. */
   6240 
   6241    if (sz == 4) {
   6242       tmpL     = newTemp(Ity_I64);
   6243       tmpRes   = newTemp(Ity_I32);
   6244       tmpSubSh = newTemp(Ity_I32);
   6245       mkpair   = Iop_32HLto64;
   6246       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6247       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6248       mask     = mkU8(31);
   6249    } else {
   6250       /* sz == 2 */
   6251       tmpL     = newTemp(Ity_I32);
   6252       tmpRes   = newTemp(Ity_I16);
   6253       tmpSubSh = newTemp(Ity_I16);
   6254       mkpair   = Iop_16HLto32;
   6255       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6256       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6257       mask     = mkU8(15);
   6258    }
   6259 
   6260    /* Do the shift, calculate the subshift value, and set
   6261       the flag thunk. */
   6262 
   6263    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6264 
   6265    if (left_shift)
   6266       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6267    else
   6268       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6269 
   6270    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
           /* The subshift value is the same double-length shift, but by
              (tmpSH - 1), with that amount masked again so it stays in
              range when tmpSH is zero. */
   6271    assign( tmpSubSh,
   6272            unop(getres,
   6273                 binop(shift,
   6274                       mkexpr(tmpL),
   6275                       binop(Iop_And8,
   6276                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6277                             mask))) );
   6278 
           /* The flag helper is always handed Iop_Shl32 or Iop_Sar32,
              whatever sz is; the operand width is passed separately as
              'ty'. */
   6279    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6280                               tmpRes, tmpSubSh, ty, tmpSH );
   6281 
   6282    /* Put result back. */
   6283 
   6284    if (epartIsReg(modrm)) {
   6285       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6286    } else {
   6287       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6288    }
   6289 
           /* Step over one further instruction byte when the amount was a
              literal (presumably the imm8 that follows the amode). */
   6290    if (amt_is_literal) delta++;
   6291    return delta;
   6292 }
   6293 
   6294 
   6295 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6296    required. */
   6297 
        /* Which modification, if any, is applied to the selected bit. */
   6298 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   6299 
        /* Returns the mnemonic suffix printed after "bt" for 'op': empty for
           BtOpNone, "s" for BtOpSet, "r" for BtOpReset, "c" for BtOpComp.
           Any other value panics. */
   6300 static const HChar* nameBtOp ( BtOp op )
   6301 {
   6302    switch (op) {
   6303       case BtOpNone:  return "";
   6304       case BtOpSet:   return "s";
   6305       case BtOpReset: return "r";
   6306       case BtOpComp:  return "c";
   6307       default: vpanic("nameBtOp(x86)");
   6308    }
   6309 }
   6310 
   6311 
   6312 static
   6313 UInt dis_bt_G_E ( const VexAbiInfo* vbi,
   6314                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6315 {
           /* Emits IR for bt/bts/btr/btc with the bit number taken from the
              G register and the operand given by E.  'sz' must be 2 or 4.

              Both operand forms share one byte-addressed memory sequence.
              When E is a register, its value is first stored at ESP-128 on
              the guest stack and the bit number is masked to the operand
              width; at the end the (possibly modified) value is loaded back
              into the register and ESP is restored.  When E is memory, the
              bit number is used unmasked and the byte offset is formed with
              an arithmetic right shift by 3.

              If op != BtOpNone the modified byte is written back, using a
              CAS when 'locked' is set and E is memory, and a plain store
              otherwise.  The selected bit is placed in the flag thunk.
              Returns delta advanced past the modrm/amode bytes. */
   6316    HChar  dis_buf[50];
   6317    UChar  modrm;
   6318    Int    len;
   6319    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6320           t_addr1, t_esp, t_mask, t_new;
   6321 
   6322    vassert(sz == 2 || sz == 4);
   6323 
   6324    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6325              = t_addr0 = t_addr1 = t_esp
   6326              = t_mask = t_new = IRTemp_INVALID;
   6327 
   6328    t_fetched = newTemp(Ity_I8);
   6329    t_new     = newTemp(Ity_I8);
   6330    t_bitno0  = newTemp(Ity_I32);
   6331    t_bitno1  = newTemp(Ity_I32);
   6332    t_bitno2  = newTemp(Ity_I8);
   6333    t_addr1   = newTemp(Ity_I32);
   6334    modrm     = getIByte(delta);
   6335 
   6336    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6337 
   6338    if (epartIsReg(modrm)) {
   6339       delta++;
   6340       /* Get it onto the client's stack. */
   6341       t_esp = newTemp(Ity_I32);
   6342       t_addr0 = newTemp(Ity_I32);
   6343 
   6344       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6345          guest_amd64_toIR.c.  We point out here only that 128 is
   6346          fast-cased in Memcheck and is > 0, so seems like a good
   6347          choice. */
   6348       vassert(vbi->guest_stack_redzone_size == 0);
   6349       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6350       putIReg(4, R_ESP, mkexpr(t_esp));
   6351 
   6352       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6353 
   6354       /* Make t_addr0 point at it. */
   6355       assign( t_addr0, mkexpr(t_esp) );
   6356 
   6357       /* Mask out upper bits of the shift amount, since we're doing a
   6358          reg. */
   6359       assign( t_bitno1, binop(Iop_And32,
   6360                               mkexpr(t_bitno0),
   6361                               mkU32(sz == 4 ? 31 : 15)) );
   6362 
   6363    } else {
   6364       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6365       delta += len;
   6366       assign( t_bitno1, mkexpr(t_bitno0) );
   6367    }
   6368 
   6369    /* At this point: t_addr0 is the address being operated on.  If it
   6370       was a reg, we will have pushed it onto the client's stack.
   6371       t_bitno1 is the bit number, suitably masked in the case of a
   6372       reg.  */
   6373 
   6374    /* Now the main sequence. */
   6375    assign( t_addr1,
   6376            binop(Iop_Add32,
   6377                  mkexpr(t_addr0),
   6378                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6379 
   6380    /* t_addr1 now holds effective address */
   6381 
   6382    assign( t_bitno2,
   6383            unop(Iop_32to8,
   6384                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6385 
   6386    /* t_bitno2 contains offset of bit within byte */
   6387 
   6388    if (op != BtOpNone) {
   6389       t_mask = newTemp(Ity_I8);
   6390       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6391    }
   6392 
   6393    /* t_mask is now a suitable byte mask */
   6394 
   6395    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6396 
   6397    if (op != BtOpNone) {
   6398       switch (op) {
   6399          case BtOpSet:
   6400             assign( t_new,
   6401                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6402             break;
   6403          case BtOpComp:
   6404             assign( t_new,
   6405                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6406             break;
   6407          case BtOpReset:
   6408             assign( t_new,
   6409                     binop(Iop_And8, mkexpr(t_fetched),
   6410                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6411             break;
   6412          default:
   6413             vpanic("dis_bt_G_E(x86)");
   6414       }
              /* Only a locked memory operand uses the CAS; the register
                 case writes to the stack copy with a plain store even if
                 'locked' is set. */
   6415       if (locked && !epartIsReg(modrm)) {
   6416          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6417                                  mkexpr(t_new)/*new*/,
   6418                                  guest_EIP_curr_instr );
   6419       } else {
   6420          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6421       }
   6422    }
   6423 
   6424    /* Side effect done; now get selected bit into Carry flag */
   6425    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6426    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6427    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6428    stmt( IRStmt_Put(
   6429             OFFB_CC_DEP1,
   6430             binop(Iop_And32,
   6431                   binop(Iop_Shr32,
   6432                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6433                         mkexpr(t_bitno2)),
   6434                   mkU32(1)))
   6435        );
   6436    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6437       elimination of previous stores to this field work better. */
   6438    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6439 
   6440    /* Move reg operand from stack back to reg */
   6441    if (epartIsReg(modrm)) {
   6442       /* t_esp still points at it. */
   6443       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6444       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6445    }
   6446 
   6447    DIP("bt%s%c %s, %s\n",
   6448        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6449        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6450 
   6451    return delta;
   6452 }
   6453 
   6454 
   6455 
   6456 /* Handle BSF/BSR.  Only v-size seems necessary. */
        /* 'fwds' selects bsf (True) or bsr (False).  'sz' must be 2 or 4.
           The source is E (register or memory) and the destination is the G
           register.  The flag thunk is set so that Z is 1 iff the source is
           zero, with the other flags zero.  When the source is zero the
           destination register keeps its previous value.  Returns delta
           advanced past the modrm/amode bytes. */
   6457 static
   6458 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6459 {
   6460    Bool   isReg;
   6461    UChar  modrm;
   6462    HChar  dis_buf[50];
   6463 
   6464    IRType ty  = szToITy(sz);
   6465    IRTemp src = newTemp(ty);
   6466    IRTemp dst = newTemp(ty);
   6467 
   6468    IRTemp src32 = newTemp(Ity_I32);
   6469    IRTemp dst32 = newTemp(Ity_I32);
   6470    IRTemp srcB  = newTemp(Ity_I1);
   6471 
   6472    vassert(sz == 4 || sz == 2);
   6473 
   6474    modrm = getIByte(delta);
   6475 
   6476    isReg = epartIsReg(modrm);
   6477    if (isReg) {
   6478       delta++;
   6479       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6480    } else {
   6481       Int    len;
   6482       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6483       delta += len;
   6484       assign( src, loadLE(ty, mkexpr(addr)) );
   6485    }
   6486 
   6487    DIP("bs%c%c %s, %s\n",
   6488        fwds ? 'f' : 'r', nameISize(sz),
   6489        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6490        nameIReg(sz, gregOfRM(modrm)));
   6491 
   6492    /* Generate a bool expression which is zero iff the original is
   6493       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
   6494       instrumented by Memcheck, is instrumented expensively, since
   6495       this may be used on the output of a preceding movmskb insn,
   6496       which has been known to be partially defined, and in need of
   6497       careful handling. */
   6498    assign( srcB, binop(mkSizedOp(ty,Iop_ExpCmpNE8),
   6499                        mkexpr(src), mkU(ty,0)) );
   6500 
   6501    /* Flags: Z is 1 iff source value is zero.  All others
   6502       are undefined -- we force them to zero. */
   6503    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6504    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6505    stmt( IRStmt_Put(
   6506             OFFB_CC_DEP1,
   6507             IRExpr_ITE( mkexpr(srcB),
   6508                         /* src!=0 */
   6509                         mkU32(0),
   6510                         /* src==0 */
   6511                         mkU32(X86G_CC_MASK_Z)
   6512                         )
   6513        ));
   6514    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6515       elimination of previous stores to this field work better. */
   6516    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6517 
   6518    /* Result: iff source value is zero, we can't use
   6519       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6520       But anyway, Intel x86 semantics say the result is undefined in
   6521       such situations.  Hence handle the zero case specially. */
   6522 
   6523    /* Bleh.  What we compute:
   6524 
   6525           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6526           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6527 
   6528           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6529           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6530 
   6531       First, widen src to 32 bits if it is not already.
   6532 
   6533       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6534       dst register unchanged when src == 0.  Hence change accordingly.
   6535    */
           /* Note: per the postscript, the code below yields the old value
              of the destination register (not 0) in the src == 0 case. */
   6536    if (sz == 2)
   6537       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6538    else
   6539       assign( src32, mkexpr(src) );
   6540 
   6541    /* The main computation, guarding against zero. */
   6542    assign( dst32,
   6543            IRExpr_ITE(
   6544               mkexpr(srcB),
   6545               /* src != 0 */
   6546               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6547                    : binop(Iop_Sub32,
   6548                            mkU32(31),
   6549                            unop(Iop_Clz32, mkexpr(src32))),
   6550               /* src == 0 -- leave dst unchanged */
   6551               widenUto32( getIReg( sz, gregOfRM(modrm) ) )
   6552            )
   6553          );
   6554 
   6555    if (sz == 2)
   6556       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6557    else
   6558       assign( dst, mkexpr(dst32) );
   6559 
   6560    /* dump result back */
   6561    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6562 
   6563    return delta;
   6564 }
   6565 
   6566 
   6567 static
   6568 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6569 {
           /* Emits IR that swaps the sz-byte view of EAX with the sz-byte
              view of integer register 'reg'.  'sz' must be 2 or 4.  Both
              old values are read into temporaries before either register
              is written. */
   6570    IRType ty = szToITy(sz);
   6571    IRTemp t1 = newTemp(ty);
   6572    IRTemp t2 = newTemp(ty);
   6573    vassert(sz == 2 || sz == 4);
   6574    assign( t1, getIReg(sz, R_EAX) );
   6575    assign( t2, getIReg(sz, reg) );
   6576    putIReg( sz, R_EAX, mkexpr(t2) );
   6577    putIReg( sz, reg, mkexpr(t1) );
   6578    DIP("xchg%c %s, %s\n",
   6579        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6580 }
   6581 
   6582 
   6583 static
   6584 void codegen_SAHF ( void )
   6585 {
   6586    /* Set the flags to:
   6587       (mk_x86g_calculate_eflags_all() & X86G_CC_MASK_O) -- retain the old O flag
   6588       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6589                 |X86G_CC_MASK_P|X86G_CC_MASK_C))
   6590    */
           /* The old flags are computed into a temporary before the thunk
              is overwritten.  %AH is obtained as bits 15:8 of EAX by
              shifting right by 8; the SZACP mask then discards everything
              else.  Each thunk field is written exactly once, in the same
              OP, DEP2, DEP1, NDEP order used by the other COPY-style flag
              writers in this file. */
   6591    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6592                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6593    IRTemp oldflags   = newTemp(Ity_I32);
   6594    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6595    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6597    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6598    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6599          binop(Iop_Or32,
   6600                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6601                binop(Iop_And32,
   6602                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6603                      mkU32(mask_SZACP))
   6604               )
   6605    ));
   6606    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6607       elimination of previous stores to this field work better. */
   6608    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6609 }
   6610 
   6611 
   6612 static
   6613 void codegen_LAHF ( void  )
   6614 {
   6615    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
           /* Computes the full flag word, keeps only the S, Z, A, P and C
              bits, forces bit 1 to one, and places that byte in bits 15:8
              of EAX.  All other bits of EAX are preserved (mask
              0xFFFF00FF).  The flag thunk itself is not written. */
   6616    IRExpr* eax_with_hole;
   6617    IRExpr* new_byte;
   6618    IRExpr* new_eax;
   6619    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6620                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6621 
   6622    IRTemp  flags = newTemp(Ity_I32);
   6623    assign( flags, mk_x86g_calculate_eflags_all() );
   6624 
   6625    eax_with_hole
   6626       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6627    new_byte
   6628       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6629                         mkU32(1<<1));
   6630    new_eax
   6631       = binop(Iop_Or32, eax_with_hole,
   6632                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6633    putIReg(4, R_EAX, new_eax);
   6634 }
   6635 
   6636 
   6637 static
   6638 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6639                        Bool        locked,
   6640                        Int         size,
   6641                        Int         delta0 )
   6642 {
           /* Emits IR for cmpxchg with G as the replacement value and E
              (register or memory) as the destination; the accumulator is
              the size-byte view of EAX.  'delta0' is the offset of the
              modrm byte.  In every case the flag thunk is set via
              setFlags_DEP1_DEP2 with Iop_Sub8 on (acc, dest), and the Z
              condition computed from it decides the outcome: on Z the
              destination receives src and EAX is unchanged, otherwise EAX
              receives the old destination value.  Returns delta0 advanced
              past the modrm/amode bytes. */
   6643    HChar dis_buf[50];
   6644    Int   len;
   6645 
   6646    IRType ty    = szToITy(size);
   6647    IRTemp acc   = newTemp(ty);
   6648    IRTemp src   = newTemp(ty);
   6649    IRTemp dest  = newTemp(ty);
   6650    IRTemp dest2 = newTemp(ty);
   6651    IRTemp acc2  = newTemp(ty);
   6652    IRTemp cond  = newTemp(Ity_I1);
   6653    IRTemp addr  = IRTemp_INVALID;
           /* NOTE(review): sibling decoders in this file fetch the modrm
              byte with getIByte; this one uses getUChar -- confirm the two
              are interchangeable. */
   6654    UChar  rm    = getUChar(delta0);
   6655 
   6656    /* There are 3 cases to consider:
   6657 
   6658       reg-reg: ignore any lock prefix, generate sequence based
   6659                on ITE
   6660 
   6661       reg-mem, not locked: ignore any lock prefix, generate sequence
   6662                            based on ITE
   6663 
   6664       reg-mem, locked: use IRCAS
   6665    */
   6666    if (epartIsReg(rm)) {
   6667       /* case 1 */
   6668       assign( dest, getIReg(size, eregOfRM(rm)) );
   6669       delta0++;
   6670       assign( src, getIReg(size, gregOfRM(rm)) );
   6671       assign( acc, getIReg(size, R_EAX) );
   6672       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6673       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6674       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6675       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6676       putIReg(size, R_EAX, mkexpr(acc2));
   6677       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6678       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6679                                nameIReg(size,gregOfRM(rm)),
   6680                                nameIReg(size,eregOfRM(rm)) );
   6681    }
   6682    else if (!epartIsReg(rm) && !locked) {
   6683       /* case 2 */
   6684       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6685       assign( dest, loadLE(ty, mkexpr(addr)) );
   6686       delta0 += len;
   6687       assign( src, getIReg(size, gregOfRM(rm)) );
   6688       assign( acc, getIReg(size, R_EAX) );
   6689       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6690       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6691       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   6692       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6693       putIReg(size, R_EAX, mkexpr(acc2));
              /* The store is unconditional: on mismatch dest2 equals the
                 value just loaded, so memory is rewritten with its old
                 contents. */
   6694       storeLE( mkexpr(addr), mkexpr(dest2) );
   6695       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6696                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6697    }
   6698    else if (!epartIsReg(rm) && locked) {
   6699       /* case 3 */
   6700       /* src is new value.  acc is expected value.  dest is old value.
   6701          Compute success from the output of the IRCAS, and steer the
   6702          new value for EAX accordingly: in case of success, EAX is
   6703          unchanged. */
   6704       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6705       delta0 += len;
   6706       assign( src, getIReg(size, gregOfRM(rm)) );
   6707       assign( acc, getIReg(size, R_EAX) );
   6708       stmt( IRStmt_CAS(
   6709          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6710                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6711       ));
   6712       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6713       assign( cond, mk_x86g_calculate_condition(X86CondZ) );
   6714       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   6715       putIReg(size, R_EAX, mkexpr(acc2));
   6716       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6717                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6718    }
   6719    else vassert(0);
   6720 
   6721    return delta0;
   6722 }
   6723 
   6724 
   6725 /* Handle conditional move instructions of the form
   6726       cmovcc E(reg-or-mem), G(reg)
   6727 
   6728    E(src) is reg-or-mem
   6729    G(dst) is reg.
   6730 
   6731    If E is reg, -->    GET %E, tmps
   6732                        GET %G, tmpd
   6733                        CMOVcc tmps, tmpd
   6734                        PUT tmpd, %G
   6735 
   6736    If E is mem  -->    (getAddr E) -> tmpa
   6737                        LD (tmpa), tmps
   6738                        GET %G, tmpd
   6739                        CMOVcc tmps, tmpd
   6740                        PUT tmpd, %G
   6741 */
        /* 'delta0' is the offset of the modrm byte.  In the memory form the
           load from E is emitted unconditionally; only the choice of which
           value is written to G depends on 'cond'.  G is always written,
           either with the source or with its own old value.  Returns
           delta0 advanced past the modrm byte (register form) or past the
           whole amode (memory form). */
   6742 static
   6743 UInt dis_cmov_E_G ( UChar       sorb,
   6744                     Int         sz,
   6745                     X86Condcode cond,
   6746                     Int         delta0 )
   6747 {
   6748    UChar rm  = getIByte(delta0);
   6749    HChar dis_buf[50];
   6750    Int   len;
   6751 
   6752    IRType ty   = szToITy(sz);
   6753    IRTemp tmps = newTemp(ty);
   6754    IRTemp tmpd = newTemp(ty);
   6755 
   6756    if (epartIsReg(rm)) {
   6757       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6758       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6759 
   6760       putIReg(sz, gregOfRM(rm),
   6761                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6762                               mkexpr(tmps),
   6763                               mkexpr(tmpd) )
   6764              );
   6765       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6766                               name_X86Condcode(cond),
   6767                               nameIReg(sz,eregOfRM(rm)),
   6768                               nameIReg(sz,gregOfRM(rm)));
   6769       return 1+delta0;
   6770    }
   6771 
   6772    /* E refers to memory */
   6773    {
   6774       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6775       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6776       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6777 
   6778       putIReg(sz, gregOfRM(rm),
   6779                   IRExpr_ITE( mk_x86g_calculate_condition(cond),
   6780                               mkexpr(tmps),
   6781                               mkexpr(tmpd) )
   6782              );
   6783 
   6784       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6785                               name_X86Condcode(cond),
   6786                               dis_buf,
   6787                               nameIReg(sz,gregOfRM(rm)));
   6788       return len+delta0;
   6789    }
   6790 }
   6791 
   6792 
   6793 static
   6794 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6795                     Bool* decodeOK )
   6796 {
           /* Emits IR for xadd G,E: E receives E+G and G receives the old
              value of E.  The flag thunk is set via setFlags_DEP1_DEP2 with
              Iop_Add8 on (old E, G).  'delta0' is the offset of the modrm
              byte.  Every handled case sets *decodeOK to True and returns
              delta0 advanced past the modrm/amode bytes. */
   6797    Int   len;
   6798    UChar rm = getIByte(delta0);
   6799    HChar dis_buf[50];
   6800 
   6801    IRType ty    = szToITy(sz);
   6802    IRTemp tmpd  = newTemp(ty);
   6803    IRTemp tmpt0 = newTemp(ty);
   6804    IRTemp tmpt1 = newTemp(ty);
   6805 
   6806    /* There are 3 cases to consider:
   6807 
   6808       reg-reg: ignore any lock prefix,
   6809                generate 'naive' (non-atomic) sequence
   6810 
   6811       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6812                            (non-atomic) sequence
   6813 
   6814       reg-mem, locked: use IRCAS
   6815    */
   6816 
   6817    if (epartIsReg(rm)) {
   6818       /* case 1 */
   6819       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6820       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6821       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6822                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6823       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
              /* E is written first and G second, so when both name the same
                 register the final value is the old E value. */
   6824       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6825       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6826       DIP("xadd%c %s, %s\n",
   6827           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6828           				 nameIReg(sz,eregOfRM(rm)));
   6829       *decodeOK = True;
   6830       return 1+delta0;
   6831    }
   6832    else if (!epartIsReg(rm) && !locked) {
   6833       /* case 2 */
   6834       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6835       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6836       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6837       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6838                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6839       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6840       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6841       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6842       DIP("xadd%c %s, %s\n",
   6843           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6844       *decodeOK = True;
   6845       return len+delta0;
   6846    }
   6847    else if (!epartIsReg(rm) && locked) {
   6848       /* case 3 */
   6849       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6850       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6851       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6852       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6853                            mkexpr(tmpd), mkexpr(tmpt0)) );
              /* The value just loaded is the CAS's expected value and the
                 sum is the new value.  The current instruction's address is
                 passed too (presumably where execution resumes if the CAS
                 fails -- confirm against casLE). */
   6854       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6855                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6856       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6857       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6858       DIP("xadd%c %s, %s\n",
   6859           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6860       *decodeOK = True;
   6861       return len+delta0;
   6862    }
   6863    /*UNREACHED*/
   6864    vassert(0);
   6865 }
   6866 
   6867 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
        /* 'delta0' is the offset of the modrm byte.  The modrm reg field
           selects the segment register.  Returns delta0 advanced past the
           modrm byte (register source) or the whole amode (memory
           source). */
   6868 
   6869 static
   6870 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6871 {
   6872    Int    len;
   6873    IRTemp addr;
   6874    UChar  rm  = getIByte(delta0);
   6875    HChar  dis_buf[50];
   6876 
   6877    if (epartIsReg(rm)) {
   6878       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6879       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6880       return 1+delta0;
   6881    } else {
   6882       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6883       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6884       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6885       return len+delta0;
   6886    }
   6887 }
   6888 
   6889 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6890    dst is ireg and sz==4, zero out top half of it.  */
        /* 'sz' must be 2 or 4 and only affects the register-destination
           form; the memory form always stores the segment register value
           as returned by getSReg.  Returns delta0 advanced past the modrm
           byte (register destination) or the whole amode (memory
           destination). */
   6891 
   6892 static
   6893 UInt dis_mov_Sw_Ew ( UChar sorb,
   6894                      Int   sz,
   6895                      Int   delta0 )
   6896 {
   6897    Int    len;
   6898    IRTemp addr;
   6899    UChar  rm  = getIByte(delta0);
   6900    HChar  dis_buf[50];
   6901 
   6902    vassert(sz == 2 || sz == 4);
   6903 
   6904    if (epartIsReg(rm)) {
   6905       if (sz == 4)
   6906          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6907       else
   6908          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6909 
   6910       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6911       return 1+delta0;
   6912    } else {
   6913       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6914       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6915       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6916       return len+delta0;
   6917    }
   6918 }
   6919 
   6920 
   6921 static
   6922 void dis_push_segreg ( UInt sreg, Int sz )
   6923 {
            /* Emits IR pushing segment register 'sreg'.  'sz' must be 2 or
               4.  ESP is lowered by sz, but only the 16-bit segment
               register value is stored at the new ESP; when sz is 4 the
               remaining two bytes of the slot are not written. */
   6924     IRTemp t1 = newTemp(Ity_I16);
   6925     IRTemp ta = newTemp(Ity_I32);
   6926     vassert(sz == 2 || sz == 4);
   6927 
   6928     assign( t1, getSReg(sreg) );
   6929     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6930     putIReg(4, R_ESP, mkexpr(ta));
   6931     storeLE( mkexpr(ta), mkexpr(t1) );
   6932 
   6933     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6934 }
   6935 
   6936 static
   6937 void dis_pop_segreg ( UInt sreg, Int sz )
   6938 {
   6939     IRTemp t1 = newTemp(Ity_I16);
   6940     IRTemp ta = newTemp(Ity_I32);
   6941     vassert(sz == 2 || sz == 4);
   6942 
   6943     assign( ta, getIReg(4, R_ESP) );
   6944     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6945 
   6946     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6947     putSReg( sreg, mkexpr(t1) );
   6948     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6949 }
   6950 
   6951 static
   6952 void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
   6953 {
   6954    IRTemp t1 = newTemp(Ity_I32);
   6955    IRTemp t2 = newTemp(Ity_I32);
   6956    assign(t1, getIReg(4,R_ESP));
   6957    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6958    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6959    jmp_treg(dres, Ijk_Ret, t2);
   6960    vassert(dres->whatNext == Dis_StopHere);
   6961 }
   6962 
   6963 /*------------------------------------------------------------*/
   6964 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6965 /*------------------------------------------------------------*/
   6966 
   6967 /* Indicates whether the op requires a rounding-mode argument.  Note
   6968    that this covers only vector floating point arithmetic ops, and
   6969    omits the scalar ones that need rounding modes.  Note also that
   6970    inconsistencies here will get picked up later by the IR sanity
   6971    checker, so this isn't correctness-critical. */
   6972 static Bool requiresRMode ( IROp op )
   6973 {
   6974    switch (op) {
   6975       /* 128 bit ops */
   6976       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   6977       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   6978       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   6979       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   6980          return True;
   6981       default:
   6982          break;
   6983    }
   6984    return False;
   6985 }
   6986 
   6987 
   6988 /* Worker function; do not call directly.
   6989    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6990 */
   6991 
   6992 static UInt dis_SSE_E_to_G_all_wrk (
   6993                UChar sorb, Int delta,
   6994                const HChar* opname, IROp op,
   6995                Bool   invertG
   6996             )
   6997 {
   6998    HChar   dis_buf[50];
   6999    Int     alen;
   7000    IRTemp  addr;
   7001    UChar   rm = getIByte(delta);
   7002    IRExpr* gpart
   7003       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   7004                 : getXMMReg(gregOfRM(rm));
   7005    if (epartIsReg(rm)) {
   7006       putXMMReg(
   7007          gregOfRM(rm),
   7008          requiresRMode(op)
   7009             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7010                         gpart,
   7011                         getXMMReg(eregOfRM(rm)))
   7012             : binop(op, gpart,
   7013                         getXMMReg(eregOfRM(rm)))
   7014       );
   7015       DIP("%s %s,%s\n", opname,
   7016                         nameXMMReg(eregOfRM(rm)),
   7017                         nameXMMReg(gregOfRM(rm)) );
   7018       return delta+1;
   7019    } else {
   7020       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7021       putXMMReg(
   7022          gregOfRM(rm),
   7023          requiresRMode(op)
   7024             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   7025                         gpart,
   7026                         loadLE(Ity_V128, mkexpr(addr)))
   7027             : binop(op, gpart,
   7028                         loadLE(Ity_V128, mkexpr(addr)))
   7029       );
   7030       DIP("%s %s,%s\n", opname,
   7031                         dis_buf,
   7032                         nameXMMReg(gregOfRM(rm)) );
   7033       return delta+alen;
   7034    }
   7035 }
   7036 
   7037 
   7038 /* All lanes SSE binary operation, G = G `op` E. */
   7039 
   7040 static
   7041 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, const HChar* opname, IROp op )
   7042 {
   7043    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   7044 }
   7045 
   7046 /* All lanes SSE binary operation, G = (not G) `op` E. */
   7047 
   7048 static
   7049 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   7050                                const HChar* opname, IROp op )
   7051 {
   7052    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   7053 }
   7054 
   7055 
   7056 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   7057 
   7058 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   7059                                   const HChar* opname, IROp op )
   7060 {
   7061    HChar   dis_buf[50];
   7062    Int     alen;
   7063    IRTemp  addr;
   7064    UChar   rm = getIByte(delta);
   7065    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7066    if (epartIsReg(rm)) {
   7067       putXMMReg( gregOfRM(rm),
   7068                  binop(op, gpart,
   7069                            getXMMReg(eregOfRM(rm))) );
   7070       DIP("%s %s,%s\n", opname,
   7071                         nameXMMReg(eregOfRM(rm)),
   7072                         nameXMMReg(gregOfRM(rm)) );
   7073       return delta+1;
   7074    } else {
   7075       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   7076          E operand needs to be made simply of zeroes. */
   7077       IRTemp epart = newTemp(Ity_V128);
   7078       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7079       assign( epart, unop( Iop_32UtoV128,
   7080                            loadLE(Ity_I32, mkexpr(addr))) );
   7081       putXMMReg( gregOfRM(rm),
   7082                  binop(op, gpart, mkexpr(epart)) );
   7083       DIP("%s %s,%s\n", opname,
   7084                         dis_buf,
   7085                         nameXMMReg(gregOfRM(rm)) );
   7086       return delta+alen;
   7087    }
   7088 }
   7089 
   7090 
   7091 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   7092 
   7093 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   7094                                   const HChar* opname, IROp op )
   7095 {
   7096    HChar   dis_buf[50];
   7097    Int     alen;
   7098    IRTemp  addr;
   7099    UChar   rm = getIByte(delta);
   7100    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7101    if (epartIsReg(rm)) {
   7102       putXMMReg( gregOfRM(rm),
   7103                  binop(op, gpart,
   7104                            getXMMReg(eregOfRM(rm))) );
   7105       DIP("%s %s,%s\n", opname,
   7106                         nameXMMReg(eregOfRM(rm)),
   7107                         nameXMMReg(gregOfRM(rm)) );
   7108       return delta+1;
   7109    } else {
   7110       /* We can only do a 64-bit memory read, so the upper half of the
   7111          E operand needs to be made simply of zeroes. */
   7112       IRTemp epart = newTemp(Ity_V128);
   7113       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7114       assign( epart, unop( Iop_64UtoV128,
   7115                            loadLE(Ity_I64, mkexpr(addr))) );
   7116       putXMMReg( gregOfRM(rm),
   7117                  binop(op, gpart, mkexpr(epart)) );
   7118       DIP("%s %s,%s\n", opname,
   7119                         dis_buf,
   7120                         nameXMMReg(gregOfRM(rm)) );
   7121       return delta+alen;
   7122    }
   7123 }
   7124 
   7125 
   7126 /* All lanes unary SSE operation, G = op(E). */
   7127 
   7128 static UInt dis_SSE_E_to_G_unary_all (
   7129                UChar sorb, Int delta,
   7130                const HChar* opname, IROp op
   7131             )
   7132 {
   7133    HChar   dis_buf[50];
   7134    Int     alen;
   7135    IRTemp  addr;
   7136    UChar   rm = getIByte(delta);
   7137    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   7138    // up in the usual way.
   7139    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   7140    if (epartIsReg(rm)) {
   7141       IRExpr* src = getXMMReg(eregOfRM(rm));
   7142       /* XXXROUNDINGFIXME */
   7143       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   7144                               : unop(op, src);
   7145       putXMMReg( gregOfRM(rm), res );
   7146       DIP("%s %s,%s\n", opname,
   7147                         nameXMMReg(eregOfRM(rm)),
   7148                         nameXMMReg(gregOfRM(rm)) );
   7149       return delta+1;
   7150    } else {
   7151       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7152       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
   7153       /* XXXROUNDINGFIXME */
   7154       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   7155                               : unop(op, src);
   7156       putXMMReg( gregOfRM(rm), res );
   7157       DIP("%s %s,%s\n", opname,
   7158                         dis_buf,
   7159                         nameXMMReg(gregOfRM(rm)) );
   7160       return delta+alen;
   7161    }
   7162 }
   7163 
   7164 
   7165 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   7166 
   7167 static UInt dis_SSE_E_to_G_unary_lo32 (
   7168                UChar sorb, Int delta,
   7169                const HChar* opname, IROp op
   7170             )
   7171 {
   7172    /* First we need to get the old G value and patch the low 32 bits
   7173       of the E operand into it.  Then apply op and write back to G. */
   7174    HChar   dis_buf[50];
   7175    Int     alen;
   7176    IRTemp  addr;
   7177    UChar   rm = getIByte(delta);
   7178    IRTemp  oldG0 = newTemp(Ity_V128);
   7179    IRTemp  oldG1 = newTemp(Ity_V128);
   7180 
   7181    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7182 
   7183    if (epartIsReg(rm)) {
   7184       assign( oldG1,
   7185               binop( Iop_SetV128lo32,
   7186                      mkexpr(oldG0),
   7187                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7188       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7189       DIP("%s %s,%s\n", opname,
   7190                         nameXMMReg(eregOfRM(rm)),
   7191                         nameXMMReg(gregOfRM(rm)) );
   7192       return delta+1;
   7193    } else {
   7194       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7195       assign( oldG1,
   7196               binop( Iop_SetV128lo32,
   7197                      mkexpr(oldG0),
   7198                      loadLE(Ity_I32, mkexpr(addr)) ));
   7199       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7200       DIP("%s %s,%s\n", opname,
   7201                         dis_buf,
   7202                         nameXMMReg(gregOfRM(rm)) );
   7203       return delta+alen;
   7204    }
   7205 }
   7206 
   7207 
   7208 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7209 
   7210 static UInt dis_SSE_E_to_G_unary_lo64 (
   7211                UChar sorb, Int delta,
   7212                const HChar* opname, IROp op
   7213             )
   7214 {
   7215    /* First we need to get the old G value and patch the low 64 bits
   7216       of the E operand into it.  Then apply op and write back to G. */
   7217    HChar   dis_buf[50];
   7218    Int     alen;
   7219    IRTemp  addr;
   7220    UChar   rm = getIByte(delta);
   7221    IRTemp  oldG0 = newTemp(Ity_V128);
   7222    IRTemp  oldG1 = newTemp(Ity_V128);
   7223 
   7224    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7225 
   7226    if (epartIsReg(rm)) {
   7227       assign( oldG1,
   7228               binop( Iop_SetV128lo64,
   7229                      mkexpr(oldG0),
   7230                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7231       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7232       DIP("%s %s,%s\n", opname,
   7233                         nameXMMReg(eregOfRM(rm)),
   7234                         nameXMMReg(gregOfRM(rm)) );
   7235       return delta+1;
   7236    } else {
   7237       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7238       assign( oldG1,
   7239               binop( Iop_SetV128lo64,
   7240                      mkexpr(oldG0),
   7241                      loadLE(Ity_I64, mkexpr(addr)) ));
   7242       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7243       DIP("%s %s,%s\n", opname,
   7244                         dis_buf,
   7245                         nameXMMReg(gregOfRM(rm)) );
   7246       return delta+alen;
   7247    }
   7248 }
   7249 
   7250 
   7251 /* SSE integer binary operation:
   7252       G = G `op` E   (eLeft == False)
   7253       G = E `op` G   (eLeft == True)
   7254 */
   7255 static UInt dis_SSEint_E_to_G(
   7256                UChar sorb, Int delta,
   7257                const HChar* opname, IROp op,
   7258                Bool   eLeft
   7259             )
   7260 {
   7261    HChar   dis_buf[50];
   7262    Int     alen;
   7263    IRTemp  addr;
   7264    UChar   rm = getIByte(delta);
   7265    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7266    IRExpr* epart = NULL;
   7267    if (epartIsReg(rm)) {
   7268       epart = getXMMReg(eregOfRM(rm));
   7269       DIP("%s %s,%s\n", opname,
   7270                         nameXMMReg(eregOfRM(rm)),
   7271                         nameXMMReg(gregOfRM(rm)) );
   7272       delta += 1;
   7273    } else {
   7274       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7275       epart = loadLE(Ity_V128, mkexpr(addr));
   7276       DIP("%s %s,%s\n", opname,
   7277                         dis_buf,
   7278                         nameXMMReg(gregOfRM(rm)) );
   7279       delta += alen;
   7280    }
   7281    putXMMReg( gregOfRM(rm),
   7282               eLeft ? binop(op, epart, gpart)
   7283 	            : binop(op, gpart, epart) );
   7284    return delta;
   7285 }
   7286 
   7287 
   7288 /* Helper for doing SSE FP comparisons. */
   7289 
   7290 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7291                            Int imm8, Bool all_lanes, Int sz )
   7292 {
   7293    imm8 &= 7;
   7294    *needNot = False;
   7295    *op      = Iop_INVALID;
   7296    if (imm8 >= 4) {
   7297       *needNot = True;
   7298       imm8 -= 4;
   7299    }
   7300 
   7301    if (sz == 4 && all_lanes) {
   7302       switch (imm8) {
   7303          case 0: *op = Iop_CmpEQ32Fx4; return;
   7304          case 1: *op = Iop_CmpLT32Fx4; return;
   7305          case 2: *op = Iop_CmpLE32Fx4; return;
   7306          case 3: *op = Iop_CmpUN32Fx4; return;
   7307          default: break;
   7308       }
   7309    }
   7310    if (sz == 4 && !all_lanes) {
   7311       switch (imm8) {
   7312          case 0: *op = Iop_CmpEQ32F0x4; return;
   7313          case 1: *op = Iop_CmpLT32F0x4; return;
   7314          case 2: *op = Iop_CmpLE32F0x4; return;
   7315          case 3: *op = Iop_CmpUN32F0x4; return;
   7316          default: break;
   7317       }
   7318    }
   7319    if (sz == 8 && all_lanes) {
   7320       switch (imm8) {
   7321          case 0: *op = Iop_CmpEQ64Fx2; return;
   7322          case 1: *op = Iop_CmpLT64Fx2; return;
   7323          case 2: *op = Iop_CmpLE64Fx2; return;
   7324          case 3: *op = Iop_CmpUN64Fx2; return;
   7325          default: break;
   7326       }
   7327    }
   7328    if (sz == 8 && !all_lanes) {
   7329       switch (imm8) {
   7330          case 0: *op = Iop_CmpEQ64F0x2; return;
   7331          case 1: *op = Iop_CmpLT64F0x2; return;
   7332          case 2: *op = Iop_CmpLE64F0x2; return;
   7333          case 3: *op = Iop_CmpUN64F0x2; return;
   7334          default: break;
   7335       }
   7336    }
   7337    vpanic("findSSECmpOp(x86,guest)");
   7338 }
   7339 
   7340 /* Handles SSE 32F/64F comparisons. */
   7341 
   7342 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7343 				const HChar* opname, Bool all_lanes, Int sz )
   7344 {
   7345    HChar   dis_buf[50];
   7346    Int     alen, imm8;
   7347    IRTemp  addr;
   7348    Bool    needNot = False;
   7349    IROp    op      = Iop_INVALID;
   7350    IRTemp  plain   = newTemp(Ity_V128);
   7351    UChar   rm      = getIByte(delta);
   7352    UShort  mask    = 0;
   7353    vassert(sz == 4 || sz == 8);
   7354    if (epartIsReg(rm)) {
   7355       imm8 = getIByte(delta+1);
   7356       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7357       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7358                                getXMMReg(eregOfRM(rm))) );
   7359       delta += 2;
   7360       DIP("%s $%d,%s,%s\n", opname,
   7361                             (Int)imm8,
   7362                             nameXMMReg(eregOfRM(rm)),
   7363                             nameXMMReg(gregOfRM(rm)) );
   7364    } else {
   7365       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7366       imm8 = getIByte(delta+alen);
   7367       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7368       assign( plain,
   7369               binop(
   7370                  op,
   7371                  getXMMReg(gregOfRM(rm)),
   7372                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7373                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7374                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7375              )
   7376       );
   7377       delta += alen+1;
   7378       DIP("%s $%d,%s,%s\n", opname,
   7379                             (Int)imm8,
   7380                             dis_buf,
   7381                             nameXMMReg(gregOfRM(rm)) );
   7382    }
   7383 
   7384    if (needNot && all_lanes) {
   7385       putXMMReg( gregOfRM(rm),
   7386                  unop(Iop_NotV128, mkexpr(plain)) );
   7387    }
   7388    else
   7389    if (needNot && !all_lanes) {
   7390       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7391       putXMMReg( gregOfRM(rm),
   7392                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7393    }
   7394    else {
   7395       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7396    }
   7397 
   7398    return delta;
   7399 }
   7400 
   7401 
   7402 /* Vector by scalar shift of G by the amount specified at the bottom
   7403    of E. */
   7404 
   7405 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7406                                  const HChar* opname, IROp op )
   7407 {
   7408    HChar   dis_buf[50];
   7409    Int     alen, size;
   7410    IRTemp  addr;
   7411    Bool    shl, shr, sar;
   7412    UChar   rm   = getIByte(delta);
   7413    IRTemp  g0   = newTemp(Ity_V128);
   7414    IRTemp  g1   = newTemp(Ity_V128);
   7415    IRTemp  amt  = newTemp(Ity_I32);
   7416    IRTemp  amt8 = newTemp(Ity_I8);
   7417    if (epartIsReg(rm)) {
   7418       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7419       DIP("%s %s,%s\n", opname,
   7420                         nameXMMReg(eregOfRM(rm)),
   7421                         nameXMMReg(gregOfRM(rm)) );
   7422       delta++;
   7423    } else {
   7424       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7425       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7426       DIP("%s %s,%s\n", opname,
   7427                         dis_buf,
   7428                         nameXMMReg(gregOfRM(rm)) );
   7429       delta += alen;
   7430    }
   7431    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7432    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7433 
   7434    shl = shr = sar = False;
   7435    size = 0;
   7436    switch (op) {
   7437       case Iop_ShlN16x8: shl = True; size = 32; break;
   7438       case Iop_ShlN32x4: shl = True; size = 32; break;
   7439       case Iop_ShlN64x2: shl = True; size = 64; break;
   7440       case Iop_SarN16x8: sar = True; size = 16; break;
   7441       case Iop_SarN32x4: sar = True; size = 32; break;
   7442       case Iop_ShrN16x8: shr = True; size = 16; break;
   7443       case Iop_ShrN32x4: shr = True; size = 32; break;
   7444       case Iop_ShrN64x2: shr = True; size = 64; break;
   7445       default: vassert(0);
   7446    }
   7447 
   7448    if (shl || shr) {
   7449      assign(
   7450         g1,
   7451         IRExpr_ITE(
   7452            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7453            binop(op, mkexpr(g0), mkexpr(amt8)),
   7454            mkV128(0x0000)
   7455         )
   7456      );
   7457    } else
   7458    if (sar) {
   7459      assign(
   7460         g1,
   7461         IRExpr_ITE(
   7462            binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size)),
   7463            binop(op, mkexpr(g0), mkexpr(amt8)),
   7464            binop(op, mkexpr(g0), mkU8(size-1))
   7465         )
   7466      );
   7467    } else {
   7468       /*NOTREACHED*/
   7469       vassert(0);
   7470    }
   7471 
   7472    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7473    return delta;
   7474 }
   7475 
   7476 
   7477 /* Vector by scalar shift of E by an immediate byte. */
   7478 
   7479 static
   7480 UInt dis_SSE_shiftE_imm ( Int delta, const HChar* opname, IROp op )
   7481 {
   7482    Bool    shl, shr, sar;
   7483    UChar   rm   = getIByte(delta);
   7484    IRTemp  e0   = newTemp(Ity_V128);
   7485    IRTemp  e1   = newTemp(Ity_V128);
   7486    UChar   amt, size;
   7487    vassert(epartIsReg(rm));
   7488    vassert(gregOfRM(rm) == 2
   7489            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7490    amt = getIByte(delta+1);
   7491    delta += 2;
   7492    DIP("%s $%d,%s\n", opname,
   7493                       (Int)amt,
   7494                       nameXMMReg(eregOfRM(rm)) );
   7495    assign( e0, getXMMReg(eregOfRM(rm)) );
   7496 
   7497    shl = shr = sar = False;
   7498    size = 0;
   7499    switch (op) {
   7500       case Iop_ShlN16x8: shl = True; size = 16; break;
   7501       case Iop_ShlN32x4: shl = True; size = 32; break;
   7502       case Iop_ShlN64x2: shl = True; size = 64; break;
   7503       case Iop_SarN16x8: sar = True; size = 16; break;
   7504       case Iop_SarN32x4: sar = True; size = 32; break;
   7505       case Iop_ShrN16x8: shr = True; size = 16; break;
   7506       case Iop_ShrN32x4: shr = True; size = 32; break;
   7507       case Iop_ShrN64x2: shr = True; size = 64; break;
   7508       default: vassert(0);
   7509    }
   7510 
   7511    if (shl || shr) {
   7512       assign( e1, amt >= size
   7513                      ? mkV128(0x0000)
   7514                      : binop(op, mkexpr(e0), mkU8(amt))
   7515       );
   7516    } else
   7517    if (sar) {
   7518       assign( e1, amt >= size
   7519                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7520                      : binop(op, mkexpr(e0), mkU8(amt))
   7521       );
   7522    } else {
   7523       /*NOTREACHED*/
   7524       vassert(0);
   7525    }
   7526 
   7527    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7528    return delta;
   7529 }
   7530 
   7531 
   7532 /* Get the current SSE rounding mode. */
   7533 
   7534 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7535 {
   7536    return binop( Iop_And32,
   7537                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7538                  mkU32(3) );
   7539 }
   7540 
   7541 static void put_sse_roundingmode ( IRExpr* sseround )
   7542 {
   7543    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7544    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7545 }
   7546 
   7547 /* Break a 128-bit value up into four 32-bit ints. */
   7548 
   7549 static void breakup128to32s ( IRTemp t128,
   7550 			      /*OUTs*/
   7551                               IRTemp* t3, IRTemp* t2,
   7552                               IRTemp* t1, IRTemp* t0 )
   7553 {
   7554    IRTemp hi64 = newTemp(Ity_I64);
   7555    IRTemp lo64 = newTemp(Ity_I64);
   7556    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7557    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7558 
   7559    vassert(t0 && *t0 == IRTemp_INVALID);
   7560    vassert(t1 && *t1 == IRTemp_INVALID);
   7561    vassert(t2 && *t2 == IRTemp_INVALID);
   7562    vassert(t3 && *t3 == IRTemp_INVALID);
   7563 
   7564    *t0 = newTemp(Ity_I32);
   7565    *t1 = newTemp(Ity_I32);
   7566    *t2 = newTemp(Ity_I32);
   7567    *t3 = newTemp(Ity_I32);
   7568    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7569    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7570    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7571    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7572 }
   7573 
   7574 /* Construct a 128-bit value from four 32-bit ints. */
   7575 
   7576 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7577                               IRTemp t1, IRTemp t0 )
   7578 {
   7579    return
   7580       binop( Iop_64HLtoV128,
   7581              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7582              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7583    );
   7584 }
   7585 
   7586 /* Break a 64-bit value up into four 16-bit ints. */
   7587 
   7588 static void breakup64to16s ( IRTemp t64,
   7589                              /*OUTs*/
   7590                              IRTemp* t3, IRTemp* t2,
   7591                              IRTemp* t1, IRTemp* t0 )
   7592 {
   7593    IRTemp hi32 = newTemp(Ity_I32);
   7594    IRTemp lo32 = newTemp(Ity_I32);
   7595    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7596    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7597 
   7598    vassert(t0 && *t0 == IRTemp_INVALID);
   7599    vassert(t1 && *t1 == IRTemp_INVALID);
   7600    vassert(t2 && *t2 == IRTemp_INVALID);
   7601    vassert(t3 && *t3 == IRTemp_INVALID);
   7602 
   7603    *t0 = newTemp(Ity_I16);
   7604    *t1 = newTemp(Ity_I16);
   7605    *t2 = newTemp(Ity_I16);
   7606    *t3 = newTemp(Ity_I16);
   7607    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7608    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7609    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7610    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7611 }
   7612 
   7613 /* Construct a 64-bit value from four 16-bit ints. */
   7614 
   7615 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7616                              IRTemp t1, IRTemp t0 )
   7617 {
   7618    return
   7619       binop( Iop_32HLto64,
   7620              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7621              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7622    );
   7623 }
   7624 
   7625 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7626    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7627    C P D ID AC.
   7628 
   7629    In all cases, code to set AC is generated.  However, VEX actually
   7630    ignores the AC value and so can optionally emit an emulation
   7631    warning when it is enabled.  In this routine, an emulation warning
   7632    is only emitted if emit_AC_emwarn is True, in which case
   7633    next_insn_EIP must be correct (this allows for correct code
   7634    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7635    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7636    generation for IRET.) */
   7637 
   7638 static
   7639 void set_EFLAGS_from_value ( IRTemp t1,
   7640                              Bool   emit_AC_emwarn,
   7641                              Addr32 next_insn_EIP )
   7642 {
   7643    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7644 
   7645    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7646       the flags thunk to X86G_CC_OP_COPY. */
   7647    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7648    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7649    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7650                      binop(Iop_And32,
   7651                            mkexpr(t1),
   7652                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7653                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7654                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7655                           )
   7656                     )
   7657        );
   7658    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7659       elimination of previous stores to this field work better. */
   7660    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7661 
   7662    /* Also need to set the D flag, which is held in bit 10 of t1.
   7663       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7664    stmt( IRStmt_Put(
   7665             OFFB_DFLAG,
   7666             IRExpr_ITE(
   7667                unop(Iop_32to1,
   7668                     binop(Iop_And32,
   7669                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7670                           mkU32(1))),
   7671                mkU32(0xFFFFFFFF),
   7672                mkU32(1)))
   7673        );
   7674 
   7675    /* Set the ID flag */
   7676    stmt( IRStmt_Put(
   7677             OFFB_IDFLAG,
   7678             IRExpr_ITE(
   7679                unop(Iop_32to1,
   7680                     binop(Iop_And32,
   7681                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7682                           mkU32(1))),
   7683                mkU32(1),
   7684                mkU32(0)))
   7685        );
   7686 
   7687    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7688       emulation warning. */
   7689    stmt( IRStmt_Put(
   7690             OFFB_ACFLAG,
   7691             IRExpr_ITE(
   7692                unop(Iop_32to1,
   7693                     binop(Iop_And32,
   7694                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7695                           mkU32(1))),
   7696                mkU32(1),
   7697                mkU32(0)))
   7698        );
   7699 
   7700    if (emit_AC_emwarn) {
   7701       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7702       stmt(
   7703          IRStmt_Exit(
   7704             binop( Iop_CmpNE32,
   7705                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7706                    mkU32(0) ),
   7707             Ijk_EmWarn,
   7708             IRConst_U32( next_insn_EIP ),
   7709             OFFB_EIP
   7710          )
   7711       );
   7712    }
   7713 }
   7714 
   7715 
   7716 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7717    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7718 
   7719    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7720 */
   7721 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7722 {
   7723    IRTemp aa      = newTemp(Ity_I64);
   7724    IRTemp bb      = newTemp(Ity_I64);
   7725    IRTemp aahi32s = newTemp(Ity_I64);
   7726    IRTemp aalo32s = newTemp(Ity_I64);
   7727    IRTemp bbhi32s = newTemp(Ity_I64);
   7728    IRTemp bblo32s = newTemp(Ity_I64);
   7729    IRTemp rHi     = newTemp(Ity_I64);
   7730    IRTemp rLo     = newTemp(Ity_I64);
   7731    IRTemp one32x2 = newTemp(Ity_I64);
   7732    assign(aa, aax);
   7733    assign(bb, bbx);
   7734    assign( aahi32s,
   7735            binop(Iop_SarN32x2,
   7736                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7737                  mkU8(16) ));
   7738    assign( aalo32s,
   7739            binop(Iop_SarN32x2,
   7740                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7741                  mkU8(16) ));
   7742    assign( bbhi32s,
   7743            binop(Iop_SarN32x2,
   7744                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7745                  mkU8(16) ));
   7746    assign( bblo32s,
   7747            binop(Iop_SarN32x2,
   7748                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7749                  mkU8(16) ));
   7750    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7751    assign(
   7752       rHi,
   7753       binop(
   7754          Iop_ShrN32x2,
   7755          binop(
   7756             Iop_Add32x2,
   7757             binop(
   7758                Iop_ShrN32x2,
   7759                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7760                mkU8(14)
   7761             ),
   7762             mkexpr(one32x2)
   7763          ),
   7764          mkU8(1)
   7765       )
   7766    );
   7767    assign(
   7768       rLo,
   7769       binop(
   7770          Iop_ShrN32x2,
   7771          binop(
   7772             Iop_Add32x2,
   7773             binop(
   7774                Iop_ShrN32x2,
   7775                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7776                mkU8(14)
   7777             ),
   7778             mkexpr(one32x2)
   7779          ),
   7780          mkU8(1)
   7781       )
   7782    );
   7783    return
   7784       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7785 }
   7786 
   7787 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7788    values (aa,bb), computes, for each lane:
   7789 
   7790           if aa_lane < 0 then - bb_lane
   7791      else if aa_lane > 0 then bb_lane
   7792      else 0
   7793 */
   7794 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7795 {
   7796    IRTemp aa       = newTemp(Ity_I64);
   7797    IRTemp bb       = newTemp(Ity_I64);
   7798    IRTemp zero     = newTemp(Ity_I64);
   7799    IRTemp bbNeg    = newTemp(Ity_I64);
   7800    IRTemp negMask  = newTemp(Ity_I64);
   7801    IRTemp posMask  = newTemp(Ity_I64);
   7802    IROp   opSub    = Iop_INVALID;
   7803    IROp   opCmpGTS = Iop_INVALID;
   7804 
   7805    switch (laneszB) {
   7806       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7807       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7808       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7809       default: vassert(0);
   7810    }
   7811 
   7812    assign( aa,      aax );
   7813    assign( bb,      bbx );
   7814    assign( zero,    mkU64(0) );
   7815    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7816    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7817    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7818 
   7819    return
   7820       binop(Iop_Or64,
   7821             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7822             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7823 
   7824 }
   7825 
   7826 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7827    value aa, computes, for each lane
   7828 
   7829    if aa < 0 then -aa else aa
   7830 
   7831    Note that the result is interpreted as unsigned, so that the
   7832    absolute value of the most negative signed input can be
   7833    represented.
   7834 */
   7835 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7836 {
   7837    IRTemp aa      = newTemp(Ity_I64);
   7838    IRTemp zero    = newTemp(Ity_I64);
   7839    IRTemp aaNeg   = newTemp(Ity_I64);
   7840    IRTemp negMask = newTemp(Ity_I64);
   7841    IRTemp posMask = newTemp(Ity_I64);
   7842    IROp   opSub   = Iop_INVALID;
   7843    IROp   opSarN  = Iop_INVALID;
   7844 
   7845    switch (laneszB) {
   7846       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7847       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7848       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7849       default: vassert(0);
   7850    }
   7851 
   7852    assign( aa,      aax );
   7853    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7854    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7855    assign( zero,    mkU64(0) );
   7856    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7857    return
   7858       binop(Iop_Or64,
   7859             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7860             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7861 }
   7862 
   7863 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7864                                         IRTemp lo64, Int byteShift )
   7865 {
   7866    vassert(byteShift >= 1 && byteShift <= 7);
   7867    return
   7868       binop(Iop_Or64,
   7869             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7870             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7871       );
   7872 }
   7873 
   7874 /* Generate a SIGSEGV followed by a restart of the current instruction
   7875    if effective_addr is not 16-aligned.  This is required behaviour
   7876    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7877    This assumes that guest_RIP_curr_instr is set correctly! */
   7878 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7879 {
   7880    stmt(
   7881       IRStmt_Exit(
   7882          binop(Iop_CmpNE32,
   7883                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
   7884                mkU32(0)),
   7885          Ijk_SigSEGV,
   7886          IRConst_U32(guest_EIP_curr_instr),
   7887          OFFB_EIP
   7888       )
   7889    );
   7890 }
   7891 
   7892 
   7893 /* Helper for deciding whether a given insn (starting at the opcode
   7894    byte) may validly be used with a LOCK prefix.  The following insns
   7895    may be used with LOCK when their destination operand is in memory.
   7896    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7897 
   7898    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7899    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7900    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7901    SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7902    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7903    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7904    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7905 
   7906    DEC        FE /1,  FF /1
   7907    INC        FE /0,  FF /0
   7908 
   7909    NEG        F6 /3,  F7 /3
   7910    NOT        F6 /2,  F7 /2
   7911 
   7912    XCHG       86, 87
   7913 
   7914    BTC        0F BB,  0F BA /7
   7915    BTR        0F B3,  0F BA /6
   7916    BTS        0F AB,  0F BA /5
   7917 
   7918    CMPXCHG    0F B0,  0F B1
   7919    CMPXCHG8B  0F C7 /1
   7920 
   7921    XADD       0F C0,  0F C1
   7922 
   7923    ------------------------------
   7924 
   7925    80 /0  =  addb $imm8,  rm8
   7926    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7927    82 /0  =  addb $imm8,  rm8
   7928    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7929 
   7930    00     =  addb r8,  rm8
   7931    01     =  addl r32, rm32  and  addw r16, rm16
   7932 
   7933    Same for ADD OR ADC SBB AND SUB XOR
   7934 
   7935    FE /1  = dec rm8
   7936    FF /1  = dec rm32  and  dec rm16
   7937 
   7938    FE /0  = inc rm8
   7939    FF /0  = inc rm32  and  inc rm16
   7940 
   7941    F6 /3  = neg rm8
   7942    F7 /3  = neg rm32  and  neg rm16
   7943 
   7944    F6 /2  = not rm8
   7945    F7 /2  = not rm32  and  not rm16
   7946 
   7947    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7948    OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
   7949 
   7950    Same for BTS, BTR
   7951 */
   7952 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   7953 {
   7954    switch (opc[0]) {
   7955       case 0x00: case 0x01: case 0x08: case 0x09:
   7956       case 0x10: case 0x11: case 0x18: case 0x19:
   7957       case 0x20: case 0x21: case 0x28: case 0x29:
   7958       case 0x30: case 0x31:
   7959          if (!epartIsReg(opc[1]))
   7960             return True;
   7961          break;
   7962 
   7963       case 0x80: case 0x81: case 0x82: case 0x83:
   7964          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
   7965              && !epartIsReg(opc[1]))
   7966             return True;
   7967          break;
   7968 
   7969       case 0xFE: case 0xFF:
   7970          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
   7971              && !epartIsReg(opc[1]))
   7972             return True;
   7973          break;
   7974 
   7975       case 0xF6: case 0xF7:
   7976          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
   7977              && !epartIsReg(opc[1]))
   7978             return True;
   7979          break;
   7980 
   7981       case 0x86: case 0x87:
   7982          if (!epartIsReg(opc[1]))
   7983             return True;
   7984          break;
   7985 
   7986       case 0x0F: {
   7987          switch (opc[1]) {
   7988             case 0xBB: case 0xB3: case 0xAB:
   7989                if (!epartIsReg(opc[2]))
   7990                   return True;
   7991                break;
   7992             case 0xBA:
   7993                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
   7994                    && !epartIsReg(opc[2]))
   7995                   return True;
   7996                break;
   7997             case 0xB0: case 0xB1:
   7998                if (!epartIsReg(opc[2]))
   7999                   return True;
   8000                break;
   8001             case 0xC7:
   8002                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   8003                   return True;
   8004                break;
   8005             case 0xC0: case 0xC1:
   8006                if (!epartIsReg(opc[2]))
   8007                   return True;
   8008                break;
   8009             default:
   8010                break;
   8011          } /* switch (opc[1]) */
   8012          break;
   8013       }
   8014 
   8015       default:
   8016          break;
   8017    } /* switch (opc[0]) */
   8018 
   8019    return False;
   8020 }
   8021 
   8022 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   8023 {
   8024    IRTemp t2 = newTemp(ty);
   8025    if (ty == Ity_I32) {
   8026       assign( t2,
   8027          binop(
   8028             Iop_Or32,
   8029             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   8030             binop(
   8031                Iop_Or32,
   8032                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   8033                                 mkU32(0x00FF0000)),
   8034                binop(Iop_Or32,
   8035                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   8036                                       mkU32(0x0000FF00)),
   8037                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   8038                                       mkU32(0x000000FF) )
   8039             )))
   8040       );
   8041       return t2;
   8042    }
   8043    if (ty == Ity_I16) {
   8044       assign(t2,
   8045              binop(Iop_Or16,
   8046                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   8047                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   8048       return t2;
   8049    }
   8050    vassert(0);
   8051    /*NOTREACHED*/
   8052    return IRTemp_INVALID;
   8053 }
   8054 
   8055 /*------------------------------------------------------------*/
   8056 /*--- Disassemble a single instruction                     ---*/
   8057 /*------------------------------------------------------------*/
   8058 
   8059 /* Disassemble a single instruction into IR.  The instruction is
   8060    located in host memory at &guest_code[delta].  *expect_CAS is set
   8061    to True if the resulting IR is expected to contain an IRCAS
   8062    statement, and False if it's not expected to.  This makes it
   8063    possible for the caller of disInstr_X86_WRK to check that
   8064    LOCK-prefixed instructions are at least plausibly translated, in
   8065    that it becomes possible to check that a (validly) LOCK-prefixed
   8066    instruction generates a translation containing an IRCAS, and
   8067    instructions without LOCK prefixes don't generate translations
   8068    containing an IRCAS.
   8069 */
   8070 static
   8071 DisResult disInstr_X86_WRK (
   8072              /*OUT*/Bool* expect_CAS,
   8073              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   8074              Bool         resteerCisOk,
   8075              void*        callback_opaque,
   8076              Long         delta64,
   8077              const VexArchInfo* archinfo,
   8078              const VexAbiInfo*  vbi,
   8079              Bool         sigill_diag
   8080           )
   8081 {
   8082    IRType    ty;
   8083    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   8084    Int       alen;
   8085    UChar     opc, modrm, abyte, pre;
   8086    UInt      d32;
   8087    HChar     dis_buf[50];
   8088    Int       am_sz, d_sz, n_prefixes;
   8089    DisResult dres;
   8090    const UChar* insn; /* used in SSE decoders */
   8091 
   8092    /* The running delta */
   8093    Int delta = (Int)delta64;
   8094 
   8095    /* Holds eip at the start of the insn, so that we can print
   8096       consistent error messages for unimplemented insns. */
   8097    Int delta_start = delta;
   8098 
   8099    /* sz denotes the nominal data-op size of the insn; we change it to
   8100       2 if an 0x66 prefix is seen */
   8101    Int sz = 4;
   8102 
   8103    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   8104       prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
   8105       indicating the prefix.  */
   8106    UChar sorb = 0;
   8107 
   8108    /* Gets set to True if a LOCK prefix is seen. */
   8109    Bool pfx_lock = False;
   8110 
   8111    /* Set result defaults. */
   8112    dres.whatNext    = Dis_Continue;
   8113    dres.len         = 0;
   8114    dres.continueAt  = 0;
   8115    dres.jk_StopHere = Ijk_INVALID;
   8116 
   8117    *expect_CAS = False;
   8118 
   8119    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   8120 
   8121    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   8122    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   8123 
   8124    /* Spot "Special" instructions (see comment at top of file). */
   8125    {
   8126       const UChar* code = guest_code + delta;
   8127       /* Spot the 12-byte preamble:
   8128          C1C703   roll $3,  %edi
   8129          C1C70D   roll $13, %edi
   8130          C1C71D   roll $29, %edi
   8131          C1C713   roll $19, %edi
   8132       */
   8133       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   8134           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   8135           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   8136           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   8137          /* Got a "Special" instruction preamble.  Which one is it? */
   8138          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   8139             /* %EDX = client_request ( %EAX ) */
   8140             DIP("%%edx = client_request ( %%eax )\n");
   8141             delta += 14;
   8142             jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
   8143             vassert(dres.whatNext == Dis_StopHere);
   8144             goto decode_success;
   8145          }
   8146          else
   8147          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   8148             /* %EAX = guest_NRADDR */
   8149             DIP("%%eax = guest_NRADDR\n");
   8150             delta += 14;
   8151             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   8152             goto decode_success;
   8153          }
   8154          else
   8155          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   8156             /* call-noredir *%EAX */
   8157             DIP("call-noredir *%%eax\n");
   8158             delta += 14;
   8159             t1 = newTemp(Ity_I32);
   8160             assign(t1, getIReg(4,R_EAX));
   8161             t2 = newTemp(Ity_I32);
   8162             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   8163             putIReg(4, R_ESP, mkexpr(t2));
   8164             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   8165             jmp_treg(&dres, Ijk_NoRedir, t1);
   8166             vassert(dres.whatNext == Dis_StopHere);
   8167             goto decode_success;
   8168          }
   8169          else
   8170          if (code[12] == 0x87 && code[13] == 0xFF /* xchgl %edi,%edi */) {
   8171             /* IR injection */
   8172             DIP("IR injection\n");
   8173             vex_inject_ir(irsb, Iend_LE);
   8174 
   8175             // Invalidate the current insn. The reason is that the IRop we're
   8176             // injecting here can change. In which case the translation has to
   8177             // be redone. For ease of handling, we simply invalidate all the
   8178             // time.
   8179             stmt(IRStmt_Put(OFFB_CMSTART, mkU32(guest_EIP_curr_instr)));
   8180             stmt(IRStmt_Put(OFFB_CMLEN,   mkU32(14)));
   8181 
   8182             delta += 14;
   8183 
   8184             stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   8185             dres.whatNext    = Dis_StopHere;
   8186             dres.jk_StopHere = Ijk_InvalICache;
   8187             goto decode_success;
   8188          }
   8189          /* We don't know what it is. */
   8190          goto decode_failure;
   8191          /*NOTREACHED*/
   8192       }
   8193    }
   8194 
   8195    /* Handle a couple of weird-ass NOPs that have been observed in the
   8196       wild. */
   8197    {
   8198       const UChar* code = guest_code + delta;
   8199       /* Sun's JVM 1.5.0 uses the following as a NOP:
   8200          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   8201       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   8202           && code[3] == 0x65 && code[4] == 0x90) {
   8203          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   8204          delta += 5;
   8205          goto decode_success;
   8206       }
   8207       /* Don't barf on recent binutils padding,
   8208          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   8209          66 2e 0f 1f 84 00 00 00 00 00
   8210          66 66 2e 0f 1f 84 00 00 00 00 00
   8211          66 66 66 2e 0f 1f 84 00 00 00 00 00
   8212          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8213          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8214          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8215       */
   8216       if (code[0] == 0x66) {
   8217          Int data16_cnt;
   8218          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   8219             if (code[data16_cnt] != 0x66)
   8220                break;
   8221          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   8222              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   8223              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   8224              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   8225              && code[data16_cnt + 8] == 0x00 ) {
   8226             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   8227             delta += 9 + data16_cnt;
   8228             goto decode_success;
   8229          }
   8230       }
   8231    }
   8232 
   8233    /* Normal instruction handling starts here. */
   8234 
   8235    /* Deal with some but not all prefixes:
   8236          66(oso)
   8237          F0(lock)
   8238          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8239       Not dealt with (left in place):
   8240          F2 F3
   8241    */
   8242    n_prefixes = 0;
   8243    while (True) {
   8244       if (n_prefixes > 7) goto decode_failure;
   8245       pre = getUChar(delta);
   8246       switch (pre) {
   8247          case 0x66:
   8248             sz = 2;
   8249             break;
   8250          case 0xF0:
   8251             pfx_lock = True;
   8252             *expect_CAS = True;
   8253             break;
   8254          case 0x3E: /* %DS: */
   8255          case 0x26: /* %ES: */
   8256          case 0x64: /* %FS: */
   8257          case 0x65: /* %GS: */
   8258             if (sorb != 0)
   8259                goto decode_failure; /* only one seg override allowed */
   8260             sorb = pre;
   8261             break;
   8262          case 0x2E: { /* %CS: */
   8263             /* 2E prefix on a conditional branch instruction is a
   8264                branch-prediction hint, which can safely be ignored.  */
   8265             UChar op1 = getIByte(delta+1);
   8266             UChar op2 = getIByte(delta+2);
   8267             if ((op1 >= 0x70 && op1 <= 0x7F)
   8268                 || (op1 == 0xE3)
   8269                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8270                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8271             } else {
   8272                /* All other CS override cases are not handled */
   8273                goto decode_failure;
   8274             }
   8275             break;
   8276          }
   8277          case 0x36: /* %SS: */
   8278             /* SS override cases are not handled */
   8279             goto decode_failure;
   8280          default:
   8281             goto not_a_prefix;
   8282       }
   8283       n_prefixes++;
   8284       delta++;
   8285    }
   8286 
   8287    not_a_prefix:
   8288 
   8289    /* Now we should be looking at the primary opcode byte or the
   8290       leading F2 or F3.  Check that any LOCK prefix is actually
   8291       allowed. */
   8292 
   8293    if (pfx_lock) {
   8294      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
   8295          DIP("lock ");
   8296       } else {
   8297          *expect_CAS = False;
   8298          goto decode_failure;
   8299       }
   8300    }
   8301 
   8302 
   8303    /* ---------------------------------------------------- */
   8304    /* --- The SSE decoder.                             --- */
   8305    /* ---------------------------------------------------- */
   8306 
   8307    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8308       previous life? */
   8309 
   8310    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8311       later section, further on. */
   8312 
   8313    insn = &guest_code[delta];
   8314 
   8315    /* Treat fxsave specially.  It should be doable even on an SSE0
   8316       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8317       any subarchitecture variant.
   8318    */
   8319 
   8320    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8321    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8322        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8323       IRDirty* d;
   8324       modrm = getIByte(delta+2);
   8325       vassert(sz == 4);
   8326       vassert(!epartIsReg(modrm));
   8327 
   8328       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8329       delta += 2+alen;
   8330       gen_SEGV_if_not_16_aligned(addr);
   8331 
   8332       DIP("fxsave %s\n", dis_buf);
   8333 
   8334       /* Uses dirty helper:
   8335             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
   8336       d = unsafeIRDirty_0_N (
   8337              0/*regparms*/,
   8338              "x86g_dirtyhelper_FXSAVE",
   8339              &x86g_dirtyhelper_FXSAVE,
   8340              mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   8341           );
   8342 
   8343       /* declare we're writing memory */
   8344       d->mFx   = Ifx_Write;
   8345       d->mAddr = mkexpr(addr);
   8346       d->mSize = 464; /* according to recent Intel docs */
   8347 
   8348       /* declare we're reading guest state */
   8349       d->nFxState = 7;
   8350       vex_bzero(&d->fxState, sizeof(d->fxState));
   8351 
   8352       d->fxState[0].fx     = Ifx_Read;
   8353       d->fxState[0].offset = OFFB_FTOP;
   8354       d->fxState[0].size   = sizeof(UInt);
   8355 
   8356       d->fxState[1].fx     = Ifx_Read;
   8357       d->fxState[1].offset = OFFB_FPREGS;
   8358       d->fxState[1].size   = 8 * sizeof(ULong);
   8359 
   8360       d->fxState[2].fx     = Ifx_Read;
   8361       d->fxState[2].offset = OFFB_FPTAGS;
   8362       d->fxState[2].size   = 8 * sizeof(UChar);
   8363 
   8364       d->fxState[3].fx     = Ifx_Read;
   8365       d->fxState[3].offset = OFFB_FPROUND;
   8366       d->fxState[3].size   = sizeof(UInt);
   8367 
   8368       d->fxState[4].fx     = Ifx_Read;
   8369       d->fxState[4].offset = OFFB_FC3210;
   8370       d->fxState[4].size   = sizeof(UInt);
   8371 
   8372       d->fxState[5].fx     = Ifx_Read;
   8373       d->fxState[5].offset = OFFB_XMM0;
   8374       d->fxState[5].size   = 8 * sizeof(U128);
   8375 
   8376       d->fxState[6].fx     = Ifx_Read;
   8377       d->fxState[6].offset = OFFB_SSEROUND;
   8378       d->fxState[6].size   = sizeof(UInt);
   8379 
   8380       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8381 	 images are packed back-to-back.  If not, the value of
   8382 	 d->fxState[5].size is wrong. */
   8383       vassert(16 == sizeof(U128));
   8384       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8385 
   8386       stmt( IRStmt_Dirty(d) );
   8387 
   8388       goto decode_success;
   8389    }
   8390 
   8391    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8392    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8393        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8394       IRDirty* d;
   8395       modrm = getIByte(delta+2);
   8396       vassert(sz == 4);
   8397       vassert(!epartIsReg(modrm));
   8398 
   8399       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8400       delta += 2+alen;
   8401       gen_SEGV_if_not_16_aligned(addr);
   8402 
   8403       DIP("fxrstor %s\n", dis_buf);
   8404 
   8405       /* Uses dirty helper:
   8406             VexEmNote x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
   8407          NOTE:
   8408             the VexEmNote value is simply ignored (unlike for FRSTOR)
   8409       */
   8410       d = unsafeIRDirty_0_N (
   8411              0/*regparms*/,
   8412              "x86g_dirtyhelper_FXRSTOR",
   8413              &x86g_dirtyhelper_FXRSTOR,
   8414              mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   8415           );
   8416 
   8417       /* declare we're reading memory */
   8418       d->mFx   = Ifx_Read;
   8419       d->mAddr = mkexpr(addr);
   8420       d->mSize = 464; /* according to recent Intel docs */
   8421 
   8422       /* declare we're writing guest state */
   8423       d->nFxState = 7;
   8424       vex_bzero(&d->fxState, sizeof(d->fxState));
   8425 
   8426       d->fxState[0].fx     = Ifx_Write;
   8427       d->fxState[0].offset = OFFB_FTOP;
   8428       d->fxState[0].size   = sizeof(UInt);
   8429 
   8430       d->fxState[1].fx     = Ifx_Write;
   8431       d->fxState[1].offset = OFFB_FPREGS;
   8432       d->fxState[1].size   = 8 * sizeof(ULong);
   8433 
   8434       d->fxState[2].fx     = Ifx_Write;
   8435       d->fxState[2].offset = OFFB_FPTAGS;
   8436       d->fxState[2].size   = 8 * sizeof(UChar);
   8437 
   8438       d->fxState[3].fx     = Ifx_Write;
   8439       d->fxState[3].offset = OFFB_FPROUND;
   8440       d->fxState[3].size   = sizeof(UInt);
   8441 
   8442       d->fxState[4].fx     = Ifx_Write;
   8443       d->fxState[4].offset = OFFB_FC3210;
   8444       d->fxState[4].size   = sizeof(UInt);
   8445 
   8446       d->fxState[5].fx     = Ifx_Write;
   8447       d->fxState[5].offset = OFFB_XMM0;
   8448       d->fxState[5].size   = 8 * sizeof(U128);
   8449 
   8450       d->fxState[6].fx     = Ifx_Write;
   8451       d->fxState[6].offset = OFFB_SSEROUND;
   8452       d->fxState[6].size   = sizeof(UInt);
   8453 
   8454       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8455 	 images are packed back-to-back.  If not, the value of
   8456 	 d->fxState[5].size is wrong. */
   8457       vassert(16 == sizeof(U128));
   8458       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8459 
   8460       stmt( IRStmt_Dirty(d) );
   8461 
   8462       goto decode_success;
   8463    }
   8464 
   8465    /* ------ SSE decoder main ------ */
   8466 
   8467    /* Skip parts of the decoder which don't apply given the stated
   8468       guest subarchitecture. */
   8469    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8470       goto after_sse_decoders;
   8471 
   8472    /* With mmxext only some extended MMX instructions are recognized.
   8473       The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
   8474       PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
   8475       PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
   8476 
   8477       http://support.amd.com/us/Embedded_TechDocs/22466.pdf
   8478       https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
   8479 
   8480    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   8481       goto mmxext;
   8482 
   8483    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8484       for SSE1 here. */
   8485 
   8486    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8487    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8488       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8489       goto decode_success;
   8490    }
   8491 
   8492    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8493    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8494       vassert(sz == 4);
   8495       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8496       goto decode_success;
   8497    }
   8498 
   8499    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8500    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8501       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8502       goto decode_success;
   8503    }
   8504 
   8505    /* 0F 54 = ANDPS -- G = G and E */
   8506    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8507       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8508       goto decode_success;
   8509    }
   8510 
   8511    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8512    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8513       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8514       goto decode_success;
   8515    }
   8516 
   8517    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8518    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8519       vassert(sz == 4);
   8520       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8521       goto decode_success;
   8522    }
   8523 
   8524    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8525    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8526    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8527       IRTemp argL = newTemp(Ity_F32);
   8528       IRTemp argR = newTemp(Ity_F32);
   8529       modrm = getIByte(delta+2);
   8530       if (epartIsReg(modrm)) {
   8531          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8532          delta += 2+1;
   8533          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8534                                   nameXMMReg(gregOfRM(modrm)) );
   8535       } else {
   8536          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8537 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8538          delta += 2+alen;
   8539          DIP("[u]comiss %s,%s\n", dis_buf,
   8540                                   nameXMMReg(gregOfRM(modrm)) );
   8541       }
   8542       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8543 
   8544       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8545       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8546       stmt( IRStmt_Put(
   8547                OFFB_CC_DEP1,
   8548                binop( Iop_And32,
   8549                       binop(Iop_CmpF64,
   8550                             unop(Iop_F32toF64,mkexpr(argL)),
   8551                             unop(Iop_F32toF64,mkexpr(argR))),
   8552                       mkU32(0x45)
   8553           )));
   8554       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8555          elimination of previous stores to this field work better. */
   8556       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8557       goto decode_success;
   8558    }
   8559 
   8560    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8561       half xmm */
   8562    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8563       IRTemp arg64 = newTemp(Ity_I64);
   8564       IRTemp rmode = newTemp(Ity_I32);
   8565       vassert(sz == 4);
   8566 
   8567       modrm = getIByte(delta+2);
   8568       do_MMX_preamble();
   8569       if (epartIsReg(modrm)) {
   8570          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8571          delta += 2+1;
   8572          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8573                                  nameXMMReg(gregOfRM(modrm)));
   8574       } else {
   8575          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8576 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8577          delta += 2+alen;
   8578          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8579                                  nameXMMReg(gregOfRM(modrm)) );
   8580       }
   8581 
   8582       assign( rmode, get_sse_roundingmode() );
   8583 
   8584       putXMMRegLane32F(
   8585          gregOfRM(modrm), 0,
   8586          binop(Iop_F64toF32,
   8587                mkexpr(rmode),
   8588                unop(Iop_I32StoF64,
   8589                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8590 
   8591       putXMMRegLane32F(
   8592          gregOfRM(modrm), 1,
   8593          binop(Iop_F64toF32,
   8594                mkexpr(rmode),
   8595                unop(Iop_I32StoF64,
   8596                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8597 
   8598       goto decode_success;
   8599    }
   8600 
   8601    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8602       quarter xmm */
   8603    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8604       IRTemp arg32 = newTemp(Ity_I32);
   8605       IRTemp rmode = newTemp(Ity_I32);
   8606       vassert(sz == 4);
   8607 
   8608       modrm = getIByte(delta+3);
   8609       if (epartIsReg(modrm)) {
   8610          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8611          delta += 3+1;
   8612          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8613                                  nameXMMReg(gregOfRM(modrm)));
   8614       } else {
   8615          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8616 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8617          delta += 3+alen;
   8618          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8619                                  nameXMMReg(gregOfRM(modrm)) );
   8620       }
   8621 
   8622       assign( rmode, get_sse_roundingmode() );
   8623 
   8624       putXMMRegLane32F(
   8625          gregOfRM(modrm), 0,
   8626          binop(Iop_F64toF32,
   8627                mkexpr(rmode),
   8628                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8629 
   8630       goto decode_success;
   8631    }
   8632 
   8633    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8634       I32 in mmx, according to prevailing SSE rounding mode */
   8635    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8636       I32 in mmx, rounding towards zero */
   8637    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8638       IRTemp dst64  = newTemp(Ity_I64);
   8639       IRTemp rmode  = newTemp(Ity_I32);
   8640       IRTemp f32lo  = newTemp(Ity_F32);
   8641       IRTemp f32hi  = newTemp(Ity_F32);
   8642       Bool   r2zero = toBool(insn[1] == 0x2C);
   8643 
   8644       do_MMX_preamble();
   8645       modrm = getIByte(delta+2);
   8646 
   8647       if (epartIsReg(modrm)) {
   8648          delta += 2+1;
   8649 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8650 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8651          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8652                                    nameXMMReg(eregOfRM(modrm)),
   8653                                    nameMMXReg(gregOfRM(modrm)));
   8654       } else {
   8655          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8656 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8657 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8658                                               mkexpr(addr),
   8659                                               mkU32(4) )));
   8660          delta += 2+alen;
   8661          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8662                                    dis_buf,
   8663                                    nameMMXReg(gregOfRM(modrm)));
   8664       }
   8665 
   8666       if (r2zero) {
   8667          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8668       } else {
   8669          assign( rmode, get_sse_roundingmode() );
   8670       }
   8671 
   8672       assign(
   8673          dst64,
   8674          binop( Iop_32HLto64,
   8675                 binop( Iop_F64toI32S,
   8676                        mkexpr(rmode),
   8677                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8678                 binop( Iop_F64toI32S,
   8679                        mkexpr(rmode),
   8680                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8681               )
   8682       );
   8683 
   8684       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8685       goto decode_success;
   8686    }
   8687 
   8688    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8689       I32 in ireg, according to prevailing SSE rounding mode */
   8690    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8691       I32 in ireg, rounding towards zero */
   8692    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8693        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8694       IRTemp rmode = newTemp(Ity_I32);
   8695       IRTemp f32lo = newTemp(Ity_F32);
   8696       Bool   r2zero = toBool(insn[2] == 0x2C);
   8697       vassert(sz == 4);
   8698 
   8699       modrm = getIByte(delta+3);
   8700       if (epartIsReg(modrm)) {
   8701          delta += 3+1;
   8702 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8703          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8704                                    nameXMMReg(eregOfRM(modrm)),
   8705                                    nameIReg(4, gregOfRM(modrm)));
   8706       } else {
   8707          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8708 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8709          delta += 3+alen;
   8710          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8711                                    dis_buf,
   8712                                    nameIReg(4, gregOfRM(modrm)));
   8713       }
   8714 
   8715       if (r2zero) {
   8716          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8717       } else {
   8718          assign( rmode, get_sse_roundingmode() );
   8719       }
   8720 
   8721       putIReg(4, gregOfRM(modrm),
   8722                  binop( Iop_F64toI32S,
   8723                         mkexpr(rmode),
   8724                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8725       );
   8726 
   8727       goto decode_success;
   8728    }
   8729 
   8730    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8731    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8732       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8733       goto decode_success;
   8734    }
   8735 
   8736    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8737    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8738       vassert(sz == 4);
   8739       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8740       goto decode_success;
   8741    }
   8742 
   8743    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8744    if (insn[0] == 0x0F && insn[1] == 0xAE
   8745        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8746 
   8747       IRTemp t64 = newTemp(Ity_I64);
   8748       IRTemp ew = newTemp(Ity_I32);
   8749 
   8750       modrm = getIByte(delta+2);
   8751       vassert(!epartIsReg(modrm));
   8752       vassert(sz == 4);
   8753 
   8754       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8755       delta += 2+alen;
   8756       DIP("ldmxcsr %s\n", dis_buf);
   8757 
   8758       /* The only thing we observe in %mxcsr is the rounding mode.
   8759          Therefore, pass the 32-bit value (SSE native-format control
   8760          word) to a clean helper, getting back a 64-bit value, the
   8761          lower half of which is the SSEROUND value to store, and the
   8762          upper half of which is the emulation-warning token which may
   8763          be generated.
   8764       */
   8765       /* ULong x86g_check_ldmxcsr ( UInt ); */
   8766       assign( t64, mkIRExprCCall(
   8767                       Ity_I64, 0/*regparms*/,
   8768                       "x86g_check_ldmxcsr",
   8769                       &x86g_check_ldmxcsr,
   8770                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8771                    )
   8772             );
   8773 
   8774       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8775       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8776       put_emwarn( mkexpr(ew) );
   8777       /* Finally, if an emulation warning was reported, side-exit to
   8778          the next insn, reporting the warning, so that Valgrind's
   8779          dispatcher sees the warning. */
   8780       stmt(
   8781          IRStmt_Exit(
   8782             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8783             Ijk_EmWarn,
   8784             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   8785             OFFB_EIP
   8786          )
   8787       );
   8788       goto decode_success;
   8789    }
   8790 
   8791 
   8792    /* mmxext sse1 subset starts here. mmxext only arches will parse
   8793       only this subset of the sse1 instructions. */
   8794   mmxext:
   8795 
   8796    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8797    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8798    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8799       Bool ok = False;
   8800       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8801       if (!ok)
   8802          goto decode_failure;
   8803       goto decode_success;
   8804    }
   8805 
   8806    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8807    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8808       Intel manual does not say anything about the usual business of
   8809       the FP reg tags getting trashed whenever an MMX insn happens.
   8810       So we just leave them alone.
   8811    */
   8812    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8813       modrm = getIByte(delta+2);
   8814       if (sz == 4 && !epartIsReg(modrm)) {
   8815          /* do_MMX_preamble(); Intel docs don't specify this */
   8816          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8817          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8818          DIP("movntq %s,%s\n", dis_buf,
   8819                                nameMMXReg(gregOfRM(modrm)));
   8820          delta += 2+alen;
   8821          goto decode_success;
   8822       }
   8823       /* else fall through */
   8824    }
   8825 
   8826    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8827    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8828    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8829       do_MMX_preamble();
   8830       delta = dis_MMXop_regmem_to_reg (
   8831                 sorb, delta+2, insn[1], "pavgb", False );
   8832       goto decode_success;
   8833    }
   8834 
   8835    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8836    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8837    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8838       do_MMX_preamble();
   8839       delta = dis_MMXop_regmem_to_reg (
   8840                 sorb, delta+2, insn[1], "pavgw", False );
   8841       goto decode_success;
   8842    }
   8843 
   8844    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8845    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8846       zero-extend of it in ireg(G). */
   8847    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8848       modrm = insn[2];
   8849       if (sz == 4 && epartIsReg(modrm)) {
   8850          IRTemp sV = newTemp(Ity_I64);
   8851          t5 = newTemp(Ity_I16);
   8852          do_MMX_preamble();
   8853          assign(sV, getMMXReg(eregOfRM(modrm)));
   8854          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8855          switch (insn[3] & 3) {
   8856             case 0:  assign(t5, mkexpr(t0)); break;
   8857             case 1:  assign(t5, mkexpr(t1)); break;
   8858             case 2:  assign(t5, mkexpr(t2)); break;
   8859             case 3:  assign(t5, mkexpr(t3)); break;
   8860             default: vassert(0); /*NOTREACHED*/
   8861          }
   8862          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8863          DIP("pextrw $%d,%s,%s\n",
   8864              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8865                            nameIReg(4,gregOfRM(modrm)));
   8866          delta += 4;
   8867          goto decode_success;
   8868       }
   8869       /* else fall through */
   8870    }
   8871 
   8872    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8873    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8874       put it into the specified lane of mmx(G). */
   8875    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8876       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8877          mmx reg.  t4 is the new lane value.  t5 is the original
   8878          mmx value. t6 is the new mmx value. */
   8879       Int lane;
   8880       t4 = newTemp(Ity_I16);
   8881       t5 = newTemp(Ity_I64);
   8882       t6 = newTemp(Ity_I64);
   8883       modrm = insn[2];
   8884       do_MMX_preamble();
   8885 
   8886       assign(t5, getMMXReg(gregOfRM(modrm)));
   8887       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8888 
   8889       if (epartIsReg(modrm)) {
   8890          assign(t4, getIReg(2, eregOfRM(modrm)));
   8891          delta += 3+1;
   8892          lane = insn[3+1-1];
   8893          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8894                                    nameIReg(2,eregOfRM(modrm)),
   8895                                    nameMMXReg(gregOfRM(modrm)));
   8896       } else {
   8897          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8898          delta += 3+alen;
   8899          lane = insn[3+alen-1];
   8900          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8901          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8902                                    dis_buf,
   8903                                    nameMMXReg(gregOfRM(modrm)));
   8904       }
   8905 
   8906       switch (lane & 3) {
   8907          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8908          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8909          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8910          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8911          default: vassert(0); /*NOTREACHED*/
   8912       }
   8913       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8914       goto decode_success;
   8915    }
   8916 
   8917    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8918    /* 0F EE = PMAXSW -- 16x4 signed max */
   8919    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8920       do_MMX_preamble();
   8921       delta = dis_MMXop_regmem_to_reg (
   8922                 sorb, delta+2, insn[1], "pmaxsw", False );
   8923       goto decode_success;
   8924    }
   8925 
   8926    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8927    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   8928    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   8929       do_MMX_preamble();
   8930       delta = dis_MMXop_regmem_to_reg (
   8931                 sorb, delta+2, insn[1], "pmaxub", False );
   8932       goto decode_success;
   8933    }
   8934 
   8935    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8936    /* 0F EA = PMINSW -- 16x4 signed min */
   8937    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   8938       do_MMX_preamble();
   8939       delta = dis_MMXop_regmem_to_reg (
   8940                 sorb, delta+2, insn[1], "pminsw", False );
   8941       goto decode_success;
   8942    }
   8943 
   8944    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8945    /* 0F DA = PMINUB -- 8x8 unsigned min */
   8946    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   8947       do_MMX_preamble();
   8948       delta = dis_MMXop_regmem_to_reg (
   8949                 sorb, delta+2, insn[1], "pminub", False );
   8950       goto decode_success;
   8951    }
   8952 
   8953    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8954    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   8955       mmx(E), turn them into a byte, and put zero-extend of it in
   8956       ireg(G). */
   8957    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   8958       modrm = insn[2];
   8959       if (epartIsReg(modrm)) {
   8960          do_MMX_preamble();
   8961          t0 = newTemp(Ity_I64);
   8962          t1 = newTemp(Ity_I32);
   8963          assign(t0, getMMXReg(eregOfRM(modrm)));
   8964          assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   8965          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   8966          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8967                                  nameIReg(4,gregOfRM(modrm)));
   8968          delta += 3;
   8969          goto decode_success;
   8970       }
   8971       /* else fall through */
   8972    }
   8973 
   8974    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8975    /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   8976    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   8977       do_MMX_preamble();
   8978       delta = dis_MMXop_regmem_to_reg (
   8979                 sorb, delta+2, insn[1], "pmuluh", False );
   8980       goto decode_success;
   8981    }
   8982 
   8983    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   8984    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   8985    /* 0F 18 /2 = PREFETCHT1 */
   8986    /* 0F 18 /3 = PREFETCHT2 */
   8987    if (insn[0] == 0x0F && insn[1] == 0x18
   8988        && !epartIsReg(insn[2])
   8989        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   8990       const HChar* hintstr = "??";
   8991 
   8992       modrm = getIByte(delta+2);
   8993       vassert(!epartIsReg(modrm));
   8994 
   8995       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8996       delta += 2+alen;
   8997 
   8998       switch (gregOfRM(modrm)) {
   8999          case 0: hintstr = "nta"; break;
   9000          case 1: hintstr = "t0"; break;
   9001          case 2: hintstr = "t1"; break;
   9002          case 3: hintstr = "t2"; break;
   9003          default: vassert(0); /*NOTREACHED*/
   9004       }
   9005 
   9006       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9007       goto decode_success;
   9008    }
   9009 
   9010    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9011    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9012    if (insn[0] == 0x0F && insn[1] == 0x0D
   9013        && !epartIsReg(insn[2])
   9014        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9015       const HChar* hintstr = "??";
   9016 
   9017       modrm = getIByte(delta+2);
   9018       vassert(!epartIsReg(modrm));
   9019 
   9020       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9021       delta += 2+alen;
   9022 
   9023       switch (gregOfRM(modrm)) {
   9024          case 0: hintstr = ""; break;
   9025          case 1: hintstr = "w"; break;
   9026          default: vassert(0); /*NOTREACHED*/
   9027       }
   9028 
   9029       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9030       goto decode_success;
   9031    }
   9032 
   9033    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9034    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9035    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9036       do_MMX_preamble();
   9037       delta = dis_MMXop_regmem_to_reg (
   9038                  sorb, delta+2, insn[1], "psadbw", False );
   9039       goto decode_success;
   9040    }
   9041 
   9042    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9043    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9044    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9045       Int order;
   9046       IRTemp sV, dV, s3, s2, s1, s0;
   9047       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9048       sV = newTemp(Ity_I64);
   9049       dV = newTemp(Ity_I64);
   9050       do_MMX_preamble();
   9051       modrm = insn[2];
   9052       if (epartIsReg(modrm)) {
   9053          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9054          order = (Int)insn[3];
   9055          delta += 2+2;
   9056          DIP("pshufw $%d,%s,%s\n", order,
   9057                                    nameMMXReg(eregOfRM(modrm)),
   9058                                    nameMMXReg(gregOfRM(modrm)));
   9059       } else {
   9060          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9061          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9062 	 order = (Int)insn[2+alen];
   9063          delta += 3+alen;
   9064          DIP("pshufw $%d,%s,%s\n", order,
   9065                                    dis_buf,
   9066                                    nameMMXReg(gregOfRM(modrm)));
   9067       }
   9068       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9069 
   9070 #     define SEL(n) \
   9071                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9072       assign(dV,
   9073 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9074                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9075       );
   9076       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9077 #     undef SEL
   9078       goto decode_success;
   9079    }
   9080 
   9081    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9082    if (insn[0] == 0x0F && insn[1] == 0xAE
   9083        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9084       vassert(sz == 4);
   9085       delta += 3;
   9086       /* Insert a memory fence.  It's sometimes important that these
   9087          are carried through to the generated code. */
   9088       stmt( IRStmt_MBE(Imbe_Fence) );
   9089       DIP("sfence\n");
   9090       goto decode_success;
   9091    }
   9092 
   9093    /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
   9094    if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
   9095       goto after_sse_decoders;
   9096 
   9097 
   9098    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   9099    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9100       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   9101       goto decode_success;
   9102    }
   9103 
   9104    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   9105    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   9106       vassert(sz == 4);
   9107       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   9108       goto decode_success;
   9109    }
   9110 
   9111    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   9112    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   9113       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   9114       goto decode_success;
   9115    }
   9116 
   9117    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   9118    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   9119       vassert(sz == 4);
   9120       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   9121       goto decode_success;
   9122    }
   9123 
   9124    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   9125    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   9126    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   9127       modrm = getIByte(delta+2);
   9128       if (epartIsReg(modrm)) {
   9129          putXMMReg( gregOfRM(modrm),
   9130                     getXMMReg( eregOfRM(modrm) ));
   9131          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9132                                   nameXMMReg(gregOfRM(modrm)));
   9133          delta += 2+1;
   9134       } else {
   9135          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9136          if (insn[1] == 0x28/*movaps*/)
   9137             gen_SEGV_if_not_16_aligned( addr );
   9138          putXMMReg( gregOfRM(modrm),
   9139                     loadLE(Ity_V128, mkexpr(addr)) );
   9140          DIP("mov[ua]ps %s,%s\n", dis_buf,
   9141                                   nameXMMReg(gregOfRM(modrm)));
   9142          delta += 2+alen;
   9143       }
   9144       goto decode_success;
   9145    }
   9146 
   9147    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   9148    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   9149    if (sz == 4 && insn[0] == 0x0F
   9150        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   9151       modrm = getIByte(delta+2);
   9152       if (epartIsReg(modrm)) {
   9153          /* fall through; awaiting test case */
   9154       } else {
   9155          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9156          if (insn[1] == 0x29/*movaps*/)
   9157             gen_SEGV_if_not_16_aligned( addr );
   9158          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9159          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9160                                   dis_buf );
   9161          delta += 2+alen;
   9162          goto decode_success;
   9163       }
   9164    }
   9165 
   9166    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   9167    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   9168    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   9169       modrm = getIByte(delta+2);
   9170       if (epartIsReg(modrm)) {
   9171          delta += 2+1;
   9172          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9173                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   9174          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9175                                nameXMMReg(gregOfRM(modrm)));
   9176       } else {
   9177          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9178          delta += 2+alen;
   9179          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   9180                           loadLE(Ity_I64, mkexpr(addr)) );
   9181          DIP("movhps %s,%s\n", dis_buf,
   9182                                nameXMMReg( gregOfRM(modrm) ));
   9183       }
   9184       goto decode_success;
   9185    }
   9186 
   9187    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   9188    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   9189       if (!epartIsReg(insn[2])) {
   9190          delta += 2;
   9191          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9192          delta += alen;
   9193          storeLE( mkexpr(addr),
   9194                   getXMMRegLane64( gregOfRM(insn[2]),
   9195                                    1/*upper lane*/ ) );
   9196          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9197                                dis_buf);
   9198          goto decode_success;
   9199       }
   9200       /* else fall through */
   9201    }
   9202 
   9203    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   9204    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   9205    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   9206       modrm = getIByte(delta+2);
   9207       if (epartIsReg(modrm)) {
   9208          delta += 2+1;
   9209          putXMMRegLane64( gregOfRM(modrm),
   9210                           0/*lower lane*/,
   9211                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   9212          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   9213                                  nameXMMReg(gregOfRM(modrm)));
   9214       } else {
   9215          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9216          delta += 2+alen;
   9217          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   9218                           loadLE(Ity_I64, mkexpr(addr)) );
   9219          DIP("movlps %s, %s\n",
   9220              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   9221       }
   9222       goto decode_success;
   9223    }
   9224 
   9225    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   9226    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   9227       if (!epartIsReg(insn[2])) {
   9228          delta += 2;
   9229          addr = disAMode ( &alen, sorb, delta, dis_buf );
   9230          delta += alen;
   9231          storeLE( mkexpr(addr),
   9232                   getXMMRegLane64( gregOfRM(insn[2]),
   9233                                    0/*lower lane*/ ) );
   9234          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   9235                                 dis_buf);
   9236          goto decode_success;
   9237       }
   9238       /* else fall through */
   9239    }
   9240 
   9241    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   9242       to 4 lowest bits of ireg(G) */
   9243    if (insn[0] == 0x0F && insn[1] == 0x50) {
   9244       modrm = getIByte(delta+2);
   9245       if (sz == 4 && epartIsReg(modrm)) {
   9246          Int src;
   9247          t0 = newTemp(Ity_I32);
   9248          t1 = newTemp(Ity_I32);
   9249          t2 = newTemp(Ity_I32);
   9250          t3 = newTemp(Ity_I32);
   9251          delta += 2+1;
   9252          src = eregOfRM(modrm);
   9253          assign( t0, binop( Iop_And32,
   9254                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   9255                             mkU32(1) ));
   9256          assign( t1, binop( Iop_And32,
   9257                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   9258                             mkU32(2) ));
   9259          assign( t2, binop( Iop_And32,
   9260                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   9261                             mkU32(4) ));
   9262          assign( t3, binop( Iop_And32,
   9263                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   9264                             mkU32(8) ));
   9265          putIReg(4, gregOfRM(modrm),
   9266                     binop(Iop_Or32,
   9267                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   9268                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   9269                          )
   9270                  );
   9271          DIP("movmskps %s,%s\n", nameXMMReg(src),
   9272                                  nameIReg(4, gregOfRM(modrm)));
   9273          goto decode_success;
   9274       }
   9275       /* else fall through */
   9276    }
   9277 
   9278    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   9279    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   9280    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   9281       modrm = getIByte(delta+2);
   9282       if (!epartIsReg(modrm)) {
   9283          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9284          gen_SEGV_if_not_16_aligned( addr );
   9285          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9286          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   9287                                  dis_buf,
   9288                                  nameXMMReg(gregOfRM(modrm)));
   9289          delta += 2+alen;
   9290          goto decode_success;
   9291       }
   9292       /* else fall through */
   9293    }
   9294 
   9295    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   9296       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   9297    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   9298       vassert(sz == 4);
   9299       modrm = getIByte(delta+3);
   9300       if (epartIsReg(modrm)) {
   9301          putXMMRegLane32( gregOfRM(modrm), 0,
   9302                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   9303          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9304                               nameXMMReg(gregOfRM(modrm)));
   9305          delta += 3+1;
   9306       } else {
   9307          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9308          /* zero bits 127:64 */
   9309          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   9310          /* zero bits 63:32 */
   9311          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   9312          /* write bits 31:0 */
   9313          putXMMRegLane32( gregOfRM(modrm), 0,
   9314                           loadLE(Ity_I32, mkexpr(addr)) );
   9315          DIP("movss %s,%s\n", dis_buf,
   9316                               nameXMMReg(gregOfRM(modrm)));
   9317          delta += 3+alen;
   9318       }
   9319       goto decode_success;
   9320    }
   9321 
   9322    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   9323       or lo 1/4 xmm). */
   9324    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   9325       vassert(sz == 4);
   9326       modrm = getIByte(delta+3);
   9327       if (epartIsReg(modrm)) {
   9328          /* fall through, we don't yet have a test case */
   9329       } else {
   9330          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9331          storeLE( mkexpr(addr),
   9332                   getXMMRegLane32(gregOfRM(modrm), 0) );
   9333          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   9334                               dis_buf);
   9335          delta += 3+alen;
   9336          goto decode_success;
   9337       }
   9338    }
   9339 
   9340    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   9341    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   9342       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   9343       goto decode_success;
   9344    }
   9345 
   9346    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   9347    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   9348       vassert(sz == 4);
   9349       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   9350       goto decode_success;
   9351    }
   9352 
   9353    /* 0F 56 = ORPS -- G = G or E */
   9354    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   9355       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   9356       goto decode_success;
   9357    }
   9358 
   9359    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9360    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9361       vassert(sz == 4);
   9362       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9363                                         "rcpps", Iop_RecipEst32Fx4 );
   9364       goto decode_success;
   9365    }
   9366 
   9367    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9368    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9369       vassert(sz == 4);
   9370       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9371                                          "rcpss", Iop_RecipEst32F0x4 );
   9372       goto decode_success;
   9373    }
   9374 
   9375    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9376    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9377       vassert(sz == 4);
   9378       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9379                                         "rsqrtps", Iop_RSqrtEst32Fx4 );
   9380       goto decode_success;
   9381    }
   9382 
   9383    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9384    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9385       vassert(sz == 4);
   9386       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9387                                          "rsqrtss", Iop_RSqrtEst32F0x4 );
   9388       goto decode_success;
   9389    }
   9390 
   9391    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9392    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9393       Int    select;
   9394       IRTemp sV, dV;
   9395       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9396       sV = newTemp(Ity_V128);
   9397       dV = newTemp(Ity_V128);
   9398       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9399       modrm = insn[2];
   9400       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9401 
   9402       if (epartIsReg(modrm)) {
   9403          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9404          select = (Int)insn[3];
   9405          delta += 2+2;
   9406          DIP("shufps $%d,%s,%s\n", select,
   9407                                    nameXMMReg(eregOfRM(modrm)),
   9408                                    nameXMMReg(gregOfRM(modrm)));
   9409       } else {
   9410          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9411          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9412          select = (Int)insn[2+alen];
   9413          delta += 3+alen;
   9414          DIP("shufps $%d,%s,%s\n", select,
   9415                                    dis_buf,
   9416                                    nameXMMReg(gregOfRM(modrm)));
   9417       }
   9418 
   9419       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9420       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9421 
   9422 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9423 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9424 
   9425       putXMMReg(
   9426          gregOfRM(modrm),
   9427          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9428                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9429       );
   9430 
   9431 #     undef SELD
   9432 #     undef SELS
   9433 
   9434       goto decode_success;
   9435    }
   9436 
   9437    /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9438    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9439       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9440                                         "sqrtps", Iop_Sqrt32Fx4 );
   9441       goto decode_success;
   9442    }
   9443 
   9444    /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9445    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9446       vassert(sz == 4);
   9447       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9448                                          "sqrtss", Iop_Sqrt32F0x4 );
   9449       goto decode_success;
   9450    }
   9451 
   9452    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9453    if (insn[0] == 0x0F && insn[1] == 0xAE
   9454        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9455       modrm = getIByte(delta+2);
   9456       vassert(sz == 4);
   9457       vassert(!epartIsReg(modrm));
   9458 
   9459       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9460       delta += 2+alen;
   9461 
   9462       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9463          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9464       */
   9465       /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9466       DIP("stmxcsr %s\n", dis_buf);
   9467       storeLE( mkexpr(addr),
   9468                mkIRExprCCall(
   9469                   Ity_I32, 0/*regp*/,
   9470                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9471                   mkIRExprVec_1( get_sse_roundingmode() )
   9472                )
   9473              );
   9474       goto decode_success;
   9475    }
   9476 
   9477    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9478    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9479       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9480       goto decode_success;
   9481    }
   9482 
   9483    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9484    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9485       vassert(sz == 4);
   9486       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9487       goto decode_success;
   9488    }
   9489 
   9490    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9491    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9492    /* These just appear to be special cases of SHUFPS */
   9493    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9494       IRTemp sV, dV;
   9495       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9496       Bool hi = toBool(insn[1] == 0x15);
   9497       sV = newTemp(Ity_V128);
   9498       dV = newTemp(Ity_V128);
   9499       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9500       modrm = insn[2];
   9501       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9502 
   9503       if (epartIsReg(modrm)) {
   9504          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9505          delta += 2+1;
   9506          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9507                                   nameXMMReg(eregOfRM(modrm)),
   9508                                   nameXMMReg(gregOfRM(modrm)));
   9509       } else {
   9510          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9511          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9512          delta += 2+alen;
   9513          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9514                                   dis_buf,
   9515                                   nameXMMReg(gregOfRM(modrm)));
   9516       }
   9517 
   9518       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9519       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9520 
   9521       if (hi) {
   9522          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9523       } else {
   9524          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9525       }
   9526 
   9527       goto decode_success;
   9528    }
   9529 
   9530    /* 0F 57 = XORPS -- G = G xor E */
   9531    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9532       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9533       goto decode_success;
   9534    }
   9535 
   9536    /* ---------------------------------------------------- */
   9537    /* --- end of the SSE decoder.                      --- */
   9538    /* ---------------------------------------------------- */
   9539 
   9540    /* ---------------------------------------------------- */
   9541    /* --- start of the SSE2 decoder.                   --- */
   9542    /* ---------------------------------------------------- */
   9543 
   9544    /* Skip parts of the decoder which don't apply given the stated
   9545       guest subarchitecture. */
   9546    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9547       goto after_sse_decoders; /* no SSE2 capabilities */
   9548 
   9549    insn = &guest_code[delta];
   9550 
   9551    /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9552    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9553       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9554       goto decode_success;
   9555    }
   9556 
   9557    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9558    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9559       vassert(sz == 4);
   9560       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9561       goto decode_success;
   9562    }
   9563 
   9564    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9565    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9566       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9567       goto decode_success;
   9568    }
   9569 
   9570    /* 66 0F 54 = ANDPD -- G = G and E */
   9571    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9572       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9573       goto decode_success;
   9574    }
   9575 
   9576    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9577    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9578       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9579       goto decode_success;
   9580    }
   9581 
   9582    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9583    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9584       vassert(sz == 4);
   9585       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9586       goto decode_success;
   9587    }
   9588 
   9589    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9590    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9591    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9592       IRTemp argL = newTemp(Ity_F64);
   9593       IRTemp argR = newTemp(Ity_F64);
   9594       modrm = getIByte(delta+2);
   9595       if (epartIsReg(modrm)) {
   9596          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9597          delta += 2+1;
   9598          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9599                                   nameXMMReg(gregOfRM(modrm)) );
   9600       } else {
   9601          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9602 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9603          delta += 2+alen;
   9604          DIP("[u]comisd %s,%s\n", dis_buf,
   9605                                   nameXMMReg(gregOfRM(modrm)) );
   9606       }
   9607       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9608 
   9609       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9610       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9611       stmt( IRStmt_Put(
   9612                OFFB_CC_DEP1,
   9613                binop( Iop_And32,
   9614                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9615                       mkU32(0x45)
   9616           )));
   9617       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9618          elimination of previous stores to this field work better. */
   9619       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9620       goto decode_success;
   9621    }
   9622 
   9623    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9624       F64 in xmm(G) */
   9625    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9626       IRTemp arg64 = newTemp(Ity_I64);
   9627       vassert(sz == 4);
   9628 
   9629       modrm = getIByte(delta+3);
   9630       if (epartIsReg(modrm)) {
   9631          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9632          delta += 3+1;
   9633          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9634                                  nameXMMReg(gregOfRM(modrm)));
   9635       } else {
   9636          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9637 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9638          delta += 3+alen;
   9639          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9640                                  nameXMMReg(gregOfRM(modrm)) );
   9641       }
   9642 
   9643       putXMMRegLane64F(
   9644          gregOfRM(modrm), 0,
   9645          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9646       );
   9647 
   9648       putXMMRegLane64F(
   9649          gregOfRM(modrm), 1,
   9650          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9651       );
   9652 
   9653       goto decode_success;
   9654    }
   9655 
   9656    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9657       xmm(G) */
   9658    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9659       IRTemp argV  = newTemp(Ity_V128);
   9660       IRTemp rmode = newTemp(Ity_I32);
   9661 
   9662       modrm = getIByte(delta+2);
   9663       if (epartIsReg(modrm)) {
   9664          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9665          delta += 2+1;
   9666          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9667                                  nameXMMReg(gregOfRM(modrm)));
   9668       } else {
   9669          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9670 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9671          delta += 2+alen;
   9672          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9673                                  nameXMMReg(gregOfRM(modrm)) );
   9674       }
   9675 
   9676       assign( rmode, get_sse_roundingmode() );
   9677       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9678 
   9679 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9680                              mkexpr(rmode),                   \
   9681                              unop(Iop_I32StoF64,mkexpr(_t)))
   9682 
   9683       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9684       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9685       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9686       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9687 
   9688 #     undef CVT
   9689 
   9690       goto decode_success;
   9691    }
   9692 
   9693    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9694       lo half xmm(G), and zero upper half */
   9695    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9696       IRTemp argV  = newTemp(Ity_V128);
   9697       IRTemp rmode = newTemp(Ity_I32);
   9698       vassert(sz == 4);
   9699 
   9700       modrm = getIByte(delta+3);
   9701       if (epartIsReg(modrm)) {
   9702          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9703          delta += 3+1;
   9704          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9705                                  nameXMMReg(gregOfRM(modrm)));
   9706       } else {
   9707          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9708 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9709          delta += 3+alen;
   9710          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9711                                  nameXMMReg(gregOfRM(modrm)) );
   9712       }
   9713 
   9714       assign( rmode, get_sse_roundingmode() );
   9715       t0 = newTemp(Ity_F64);
   9716       t1 = newTemp(Ity_F64);
   9717       assign( t0, unop(Iop_ReinterpI64asF64,
   9718                        unop(Iop_V128to64, mkexpr(argV))) );
   9719       assign( t1, unop(Iop_ReinterpI64asF64,
   9720                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9721 
   9722 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9723                              mkexpr(rmode),                   \
   9724                              mkexpr(_t) )
   9725 
   9726       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9727       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9728       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9729       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9730 
   9731 #     undef CVT
   9732 
   9733       goto decode_success;
   9734    }
   9735 
   9736    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9737       I32 in mmx, according to prevailing SSE rounding mode */
   9738    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9739       I32 in mmx, rounding towards zero */
   9740    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9741       IRTemp dst64  = newTemp(Ity_I64);
   9742       IRTemp rmode  = newTemp(Ity_I32);
   9743       IRTemp f64lo  = newTemp(Ity_F64);
   9744       IRTemp f64hi  = newTemp(Ity_F64);
   9745       Bool   r2zero = toBool(insn[1] == 0x2C);
   9746 
   9747       do_MMX_preamble();
   9748       modrm = getIByte(delta+2);
   9749 
   9750       if (epartIsReg(modrm)) {
   9751          delta += 2+1;
   9752 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9753 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9754          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9755                                    nameXMMReg(eregOfRM(modrm)),
   9756                                    nameMMXReg(gregOfRM(modrm)));
   9757       } else {
   9758          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9759 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9760 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9761                                               mkexpr(addr),
   9762                                               mkU32(8) )));
   9763          delta += 2+alen;
   9764          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   9765                                    dis_buf,
   9766                                    nameMMXReg(gregOfRM(modrm)));
   9767       }
   9768 
   9769       if (r2zero) {
   9770          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9771       } else {
   9772          assign( rmode, get_sse_roundingmode() );
   9773       }
   9774 
   9775       assign(
   9776          dst64,
   9777          binop( Iop_32HLto64,
   9778                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9779                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9780               )
   9781       );
   9782 
   9783       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9784       goto decode_success;
   9785    }
   9786 
   9787    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9788       lo half xmm(G), and zero upper half */
   9789    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9790       been nicer to merge them together, but the insn[] offsets differ
   9791       by one. */
   9792    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9793       IRTemp argV  = newTemp(Ity_V128);
   9794       IRTemp rmode = newTemp(Ity_I32);
   9795 
   9796       modrm = getIByte(delta+2);
   9797       if (epartIsReg(modrm)) {
   9798          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9799          delta += 2+1;
   9800          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9801                                  nameXMMReg(gregOfRM(modrm)));
   9802       } else {
   9803          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9804 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9805          delta += 2+alen;
   9806          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9807                                  nameXMMReg(gregOfRM(modrm)) );
   9808       }
   9809 
   9810       assign( rmode, get_sse_roundingmode() );
   9811       t0 = newTemp(Ity_F64);
   9812       t1 = newTemp(Ity_F64);
   9813       assign( t0, unop(Iop_ReinterpI64asF64,
   9814                        unop(Iop_V128to64, mkexpr(argV))) );
   9815       assign( t1, unop(Iop_ReinterpI64asF64,
   9816                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9817 
   9818 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9819                              mkexpr(rmode),                   \
   9820                              mkexpr(_t) )
   9821 
   9822       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9823       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9824       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9825       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9826 
   9827 #     undef CVT
   9828 
   9829       goto decode_success;
   9830    }
   9831 
   9832    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9833       xmm(G) */
   9834    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9835       IRTemp arg64 = newTemp(Ity_I64);
   9836 
   9837       modrm = getIByte(delta+2);
   9838       if (epartIsReg(modrm)) {
   9839          /* Only switch to MMX mode if the source is a MMX register.
   9840             This is inconsistent with all other instructions which
   9841             convert between XMM and (M64 or MMX), which always switch
   9842             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9843             least, that's what the Intel docs seem to me to say.
   9844             Fixes #210264. */
   9845          do_MMX_preamble();
   9846          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9847          delta += 2+1;
   9848          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9849                                  nameXMMReg(gregOfRM(modrm)));
   9850       } else {
   9851          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9852 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9853          delta += 2+alen;
   9854          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9855                                  nameXMMReg(gregOfRM(modrm)) );
   9856       }
   9857 
   9858       putXMMRegLane64F(
   9859          gregOfRM(modrm), 0,
   9860          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9861       );
   9862 
   9863       putXMMRegLane64F(
   9864          gregOfRM(modrm), 1,
   9865          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9866       );
   9867 
   9868       goto decode_success;
   9869    }
   9870 
   9871    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9872       xmm(G) */
   9873    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9874       IRTemp argV  = newTemp(Ity_V128);
   9875       IRTemp rmode = newTemp(Ity_I32);
   9876 
   9877       modrm = getIByte(delta+2);
   9878       if (epartIsReg(modrm)) {
   9879          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9880          delta += 2+1;
   9881          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9882                                  nameXMMReg(gregOfRM(modrm)));
   9883       } else {
   9884          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9885 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9886          delta += 2+alen;
   9887          DIP("cvtps2dq %s,%s\n", dis_buf,
   9888                                  nameXMMReg(gregOfRM(modrm)) );
   9889       }
   9890 
   9891       assign( rmode, get_sse_roundingmode() );
   9892       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9893 
   9894       /* This is less than ideal.  If it turns out to be a performance
   9895 	 bottleneck it can be improved. */
   9896 #     define CVT(_t)                            \
   9897         binop( Iop_F64toI32S,                   \
   9898                mkexpr(rmode),                   \
   9899                unop( Iop_F32toF64,              \
   9900                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9901 
   9902       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9903       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9904       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9905       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9906 
   9907 #     undef CVT
   9908 
   9909       goto decode_success;
   9910    }
   9911 
   9912    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9913       F64 in xmm(G). */
   9914    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9915       IRTemp f32lo = newTemp(Ity_F32);
   9916       IRTemp f32hi = newTemp(Ity_F32);
   9917 
   9918       modrm = getIByte(delta+2);
   9919       if (epartIsReg(modrm)) {
   9920          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9921          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9922          delta += 2+1;
   9923          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9924                                  nameXMMReg(gregOfRM(modrm)));
   9925       } else {
   9926          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9927 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9928 	 assign( f32hi, loadLE(Ity_F32,
   9929                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9930          delta += 2+alen;
   9931          DIP("cvtps2pd %s,%s\n", dis_buf,
   9932                                  nameXMMReg(gregOfRM(modrm)) );
   9933       }
   9934 
   9935       putXMMRegLane64F( gregOfRM(modrm), 1,
   9936                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9937       putXMMRegLane64F( gregOfRM(modrm), 0,
   9938                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9939 
   9940       goto decode_success;
   9941    }
   9942 
   9943    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9944       I32 in ireg, according to prevailing SSE rounding mode */
   9945    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9946       I32 in ireg, rounding towards zero */
   9947    if (insn[0] == 0xF2 && insn[1] == 0x0F
   9948        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   9949       IRTemp rmode = newTemp(Ity_I32);
   9950       IRTemp f64lo = newTemp(Ity_F64);
   9951       Bool   r2zero = toBool(insn[2] == 0x2C);
   9952       vassert(sz == 4);
   9953 
   9954       modrm = getIByte(delta+3);
   9955       if (epartIsReg(modrm)) {
   9956          delta += 3+1;
   9957 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9958          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9959                                    nameXMMReg(eregOfRM(modrm)),
   9960                                    nameIReg(4, gregOfRM(modrm)));
   9961       } else {
   9962          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9963 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9964          delta += 3+alen;
   9965          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9966                                    dis_buf,
   9967                                    nameIReg(4, gregOfRM(modrm)));
   9968       }
   9969 
   9970       if (r2zero) {
   9971          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9972       } else {
   9973          assign( rmode, get_sse_roundingmode() );
   9974       }
   9975 
   9976       putIReg(4, gregOfRM(modrm),
   9977                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   9978 
   9979       goto decode_success;
   9980    }
   9981 
   9982    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   9983       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   9984    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9985       IRTemp rmode = newTemp(Ity_I32);
   9986       IRTemp f64lo = newTemp(Ity_F64);
   9987       vassert(sz == 4);
   9988 
   9989       modrm = getIByte(delta+3);
   9990       if (epartIsReg(modrm)) {
   9991          delta += 3+1;
   9992 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9993          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9994                                  nameXMMReg(gregOfRM(modrm)));
   9995       } else {
   9996          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9997 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9998          delta += 3+alen;
   9999          DIP("cvtsd2ss %s,%s\n", dis_buf,
   10000                                  nameXMMReg(gregOfRM(modrm)));
   10001       }
   10002 
   10003       assign( rmode, get_sse_roundingmode() );
   10004       putXMMRegLane32F(
   10005          gregOfRM(modrm), 0,
   10006          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   10007       );
   10008 
   10009       goto decode_success;
   10010    }
   10011 
   10012    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   10013       half xmm */
   10014    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   10015       IRTemp arg32 = newTemp(Ity_I32);
   10016       vassert(sz == 4);
   10017 
   10018       modrm = getIByte(delta+3);
   10019       if (epartIsReg(modrm)) {
   10020          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   10021          delta += 3+1;
   10022          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   10023                                  nameXMMReg(gregOfRM(modrm)));
   10024       } else {
   10025          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10026 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   10027          delta += 3+alen;
   10028          DIP("cvtsi2sd %s,%s\n", dis_buf,
   10029                                  nameXMMReg(gregOfRM(modrm)) );
   10030       }
   10031 
   10032       putXMMRegLane64F(
   10033          gregOfRM(modrm), 0,
   10034          unop(Iop_I32StoF64, mkexpr(arg32)) );
   10035 
   10036       goto decode_success;
   10037    }
   10038 
   10039    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   10040       low half xmm(G) */
   10041    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   10042       IRTemp f32lo = newTemp(Ity_F32);
   10043       vassert(sz == 4);
   10044 
   10045       modrm = getIByte(delta+3);
   10046       if (epartIsReg(modrm)) {
   10047          delta += 3+1;
   10048 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   10049          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10050                                  nameXMMReg(gregOfRM(modrm)));
   10051       } else {
   10052          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10053 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   10054          delta += 3+alen;
   10055          DIP("cvtss2sd %s,%s\n", dis_buf,
   10056                                  nameXMMReg(gregOfRM(modrm)));
   10057       }
   10058 
   10059       putXMMRegLane64F( gregOfRM(modrm), 0,
   10060                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   10061 
   10062       goto decode_success;
   10063    }
   10064 
   10065    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   10066       lo half xmm(G), and zero upper half, rounding towards zero */
   10067    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   10068       IRTemp argV  = newTemp(Ity_V128);
   10069       IRTemp rmode = newTemp(Ity_I32);
   10070 
   10071       modrm = getIByte(delta+2);
   10072       if (epartIsReg(modrm)) {
   10073          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10074          delta += 2+1;
   10075          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10076                                   nameXMMReg(gregOfRM(modrm)));
   10077       } else {
   10078          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10079 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10080          delta += 2+alen;
   10081          DIP("cvttpd2dq %s,%s\n", dis_buf,
   10082                                   nameXMMReg(gregOfRM(modrm)) );
   10083       }
   10084 
   10085       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10086 
   10087       t0 = newTemp(Ity_F64);
   10088       t1 = newTemp(Ity_F64);
   10089       assign( t0, unop(Iop_ReinterpI64asF64,
   10090                        unop(Iop_V128to64, mkexpr(argV))) );
   10091       assign( t1, unop(Iop_ReinterpI64asF64,
   10092                        unop(Iop_V128HIto64, mkexpr(argV))) );
   10093 
   10094 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   10095                              mkexpr(rmode),                   \
   10096                              mkexpr(_t) )
   10097 
   10098       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   10099       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   10100       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10101       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10102 
   10103 #     undef CVT
   10104 
   10105       goto decode_success;
   10106    }
   10107 
   10108    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   10109       xmm(G), rounding towards zero */
   10110    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   10111       IRTemp argV  = newTemp(Ity_V128);
   10112       IRTemp rmode = newTemp(Ity_I32);
   10113       vassert(sz == 4);
   10114 
   10115       modrm = getIByte(delta+3);
   10116       if (epartIsReg(modrm)) {
   10117          assign( argV, getXMMReg(eregOfRM(modrm)) );
   10118          delta += 3+1;
   10119          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10120                                   nameXMMReg(gregOfRM(modrm)));
   10121       } else {
   10122          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10123 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10124          delta += 3+alen;
   10125          DIP("cvttps2dq %s,%s\n", dis_buf,
   10126                                   nameXMMReg(gregOfRM(modrm)) );
   10127       }
   10128 
   10129       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10130       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   10131 
   10132       /* This is less than ideal.  If it turns out to be a performance
   10133 	 bottleneck it can be improved. */
   10134 #     define CVT(_t)                            \
   10135         binop( Iop_F64toI32S,                   \
   10136                mkexpr(rmode),                   \
   10137                unop( Iop_F32toF64,              \
   10138                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10139 
   10140       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   10141       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   10142       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   10143       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   10144 
   10145 #     undef CVT
   10146 
   10147       goto decode_success;
   10148    }
   10149 
   10150    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   10151    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   10152       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   10153       goto decode_success;
   10154    }
   10155 
   10156    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   10157    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   10158       vassert(sz == 4);
   10159       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   10160       goto decode_success;
   10161    }
   10162 
   10163    /* 0F AE /5 = LFENCE -- load fence (serializes loads) */
   10164    /* 0F AE /6 = MFENCE -- memory fence (serializes loads and stores) */
   10165    if (insn[0] == 0x0F && insn[1] == 0xAE
   10166        && epartIsReg(insn[2])
   10167        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   10168       vassert(sz == 4);
   10169       delta += 3;
   10170       /* Insert a memory fence.  It's sometimes important that these
   10171          are carried through to the generated code. */
   10172       stmt( IRStmt_MBE(Imbe_Fence) );
   10173       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   10174       goto decode_success;
   10175    }
   10176 
   10177    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   10178    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   10179       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   10180       goto decode_success;
   10181    }
   10182 
   10183    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   10184    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   10185       vassert(sz == 4);
   10186       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   10187       goto decode_success;
   10188    }
   10189 
   10190    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   10191    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   10192       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   10193       goto decode_success;
   10194    }
   10195 
   10196    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   10197    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   10198       vassert(sz == 4);
   10199       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   10200       goto decode_success;
   10201    }
   10202 
   10203    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   10204    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   10205    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   10206    if (sz == 2 && insn[0] == 0x0F
   10207        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   10208       const HChar* wot = insn[1]==0x28 ? "apd" :
   10209                          insn[1]==0x10 ? "upd" : "dqa";
   10210       modrm = getIByte(delta+2);
   10211       if (epartIsReg(modrm)) {
   10212          putXMMReg( gregOfRM(modrm),
   10213                     getXMMReg( eregOfRM(modrm) ));
   10214          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   10215                                    nameXMMReg(gregOfRM(modrm)));
   10216          delta += 2+1;
   10217       } else {
   10218          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10219          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   10220             gen_SEGV_if_not_16_aligned( addr );
   10221          putXMMReg( gregOfRM(modrm),
   10222                     loadLE(Ity_V128, mkexpr(addr)) );
   10223          DIP("mov%s %s,%s\n", wot, dis_buf,
   10224                                    nameXMMReg(gregOfRM(modrm)));
   10225          delta += 2+alen;
   10226       }
   10227       goto decode_success;
   10228    }
   10229 
   10230    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   10231    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   10232    if (sz == 2 && insn[0] == 0x0F
   10233        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   10234       const HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   10235       modrm = getIByte(delta+2);
   10236       if (epartIsReg(modrm)) {
   10237          /* fall through; awaiting test case */
   10238       } else {
   10239          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10240          if (insn[1] == 0x29/*movapd*/)
   10241             gen_SEGV_if_not_16_aligned( addr );
   10242          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10243          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   10244                                    dis_buf );
   10245          delta += 2+alen;
   10246          goto decode_success;
   10247       }
   10248    }
   10249 
   10250    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   10251    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10252       modrm = getIByte(delta+2);
   10253       if (epartIsReg(modrm)) {
   10254          delta += 2+1;
   10255          putXMMReg(
   10256             gregOfRM(modrm),
   10257             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10258          );
   10259          DIP("movd %s, %s\n",
   10260              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10261       } else {
   10262          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10263          delta += 2+alen;
   10264          putXMMReg(
   10265             gregOfRM(modrm),
   10266             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10267          );
   10268          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10269       }
   10270       goto decode_success;
   10271    }
   10272 
   10273    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   10274    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10275       modrm = getIByte(delta+2);
   10276       if (epartIsReg(modrm)) {
   10277          delta += 2+1;
   10278          putIReg( 4, eregOfRM(modrm),
   10279                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10280          DIP("movd %s, %s\n",
   10281              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10282       } else {
   10283          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10284          delta += 2+alen;
   10285          storeLE( mkexpr(addr),
   10286                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10287          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10288       }
   10289       goto decode_success;
   10290    }
   10291 
   10292    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   10293    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10294       modrm = getIByte(delta+2);
   10295       if (epartIsReg(modrm)) {
   10296          delta += 2+1;
   10297          putXMMReg( eregOfRM(modrm),
   10298                     getXMMReg(gregOfRM(modrm)) );
   10299          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10300                                 nameXMMReg(eregOfRM(modrm)));
   10301       } else {
   10302          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10303          delta += 2+alen;
   10304          gen_SEGV_if_not_16_aligned( addr );
   10305          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10306          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10307       }
   10308       goto decode_success;
   10309    }
   10310 
   10311    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10312    /* Unfortunately can't simply use the MOVDQA case since the
   10313       prefix lengths are different (66 vs F3) */
   10314    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10315       vassert(sz == 4);
   10316       modrm = getIByte(delta+3);
   10317       if (epartIsReg(modrm)) {
   10318          putXMMReg( gregOfRM(modrm),
   10319                     getXMMReg( eregOfRM(modrm) ));
   10320          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10321                                nameXMMReg(gregOfRM(modrm)));
   10322          delta += 3+1;
   10323       } else {
   10324          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10325          putXMMReg( gregOfRM(modrm),
   10326                     loadLE(Ity_V128, mkexpr(addr)) );
   10327          DIP("movdqu %s,%s\n", dis_buf,
   10328                                nameXMMReg(gregOfRM(modrm)));
   10329          delta += 3+alen;
   10330       }
   10331       goto decode_success;
   10332    }
   10333 
   10334    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10335    /* Unfortunately can't simply use the MOVDQA case since the
   10336       prefix lengths are different (66 vs F3) */
   10337    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10338       vassert(sz == 4);
   10339       modrm = getIByte(delta+3);
   10340       if (epartIsReg(modrm)) {
   10341          delta += 3+1;
   10342          putXMMReg( eregOfRM(modrm),
   10343                     getXMMReg(gregOfRM(modrm)) );
   10344          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10345                                 nameXMMReg(eregOfRM(modrm)));
   10346       } else {
   10347          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10348          delta += 3+alen;
   10349          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10350          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10351       }
   10352       goto decode_success;
   10353    }
   10354 
   10355    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   10356    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10357       vassert(sz == 4);
   10358       modrm = getIByte(delta+3);
   10359       if (epartIsReg(modrm)) {
   10360          do_MMX_preamble();
   10361          putMMXReg( gregOfRM(modrm),
   10362                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10363          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10364                                 nameMMXReg(gregOfRM(modrm)));
   10365          delta += 3+1;
   10366          goto decode_success;
   10367       } else {
   10368          /* fall through, apparently no mem case for this insn */
   10369       }
   10370    }
   10371 
   10372    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10373    /* This seems identical to MOVHPS.  This instruction encoding is
   10374       completely crazy. */
   10375    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10376       modrm = getIByte(delta+2);
   10377       if (epartIsReg(modrm)) {
   10378          /* fall through; apparently reg-reg is not possible */
   10379       } else {
   10380          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10381          delta += 2+alen;
   10382          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10383                           loadLE(Ity_I64, mkexpr(addr)) );
   10384          DIP("movhpd %s,%s\n", dis_buf,
   10385                                nameXMMReg( gregOfRM(modrm) ));
   10386          goto decode_success;
   10387       }
   10388    }
   10389 
   10390    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10391    /* Again, this seems identical to MOVHPS. */
   10392    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10393       if (!epartIsReg(insn[2])) {
   10394          delta += 2;
   10395          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10396          delta += alen;
   10397          storeLE( mkexpr(addr),
   10398                   getXMMRegLane64( gregOfRM(insn[2]),
   10399                                    1/*upper lane*/ ) );
   10400          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10401                                dis_buf);
   10402          goto decode_success;
   10403       }
   10404       /* else fall through */
   10405    }
   10406 
   10407    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10408    /* Identical to MOVLPS ? */
   10409    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10410       modrm = getIByte(delta+2);
   10411       if (epartIsReg(modrm)) {
   10412          /* fall through; apparently reg-reg is not possible */
   10413       } else {
   10414          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10415          delta += 2+alen;
   10416          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10417                           loadLE(Ity_I64, mkexpr(addr)) );
   10418          DIP("movlpd %s, %s\n",
   10419              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10420          goto decode_success;
   10421       }
   10422    }
   10423 
   10424    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10425    /* Identical to MOVLPS ? */
   10426    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10427       if (!epartIsReg(insn[2])) {
   10428          delta += 2;
   10429          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10430          delta += alen;
   10431          storeLE( mkexpr(addr),
   10432                   getXMMRegLane64( gregOfRM(insn[2]),
   10433                                    0/*lower lane*/ ) );
   10434          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10435                                 dis_buf);
   10436          goto decode_success;
   10437       }
   10438       /* else fall through */
   10439    }
   10440 
   10441    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10442       2 lowest bits of ireg(G) */
   10443    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10444       modrm = getIByte(delta+2);
   10445       if (sz == 2 && epartIsReg(modrm)) {
   10446          Int src;
   10447          t0 = newTemp(Ity_I32);
   10448          t1 = newTemp(Ity_I32);
   10449          delta += 2+1;
   10450          src = eregOfRM(modrm);
   10451          assign( t0, binop( Iop_And32,
   10452                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10453                             mkU32(1) ));
   10454          assign( t1, binop( Iop_And32,
   10455                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10456                             mkU32(2) ));
   10457          putIReg(4, gregOfRM(modrm),
   10458                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10459                  );
   10460          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10461                                  nameIReg(4, gregOfRM(modrm)));
   10462          goto decode_success;
   10463       }
   10464       /* else fall through */
   10465    }
   10466 
   10467    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   10468    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10469       modrm = getIByte(delta+2);
   10470       if (sz == 2 && epartIsReg(modrm)) {
   10471          IRTemp regD    = newTemp(Ity_V128);
   10472          IRTemp mask    = newTemp(Ity_V128);
   10473          IRTemp olddata = newTemp(Ity_V128);
   10474          IRTemp newdata = newTemp(Ity_V128);
   10475                 addr    = newTemp(Ity_I32);
   10476 
   10477          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10478          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10479 
   10480          /* Unfortunately can't do the obvious thing with SarN8x16
   10481             here since that can't be re-emitted as SSE2 code - no such
   10482             insn. */
   10483 	 assign(
   10484             mask,
   10485             binop(Iop_64HLtoV128,
   10486                   binop(Iop_SarN8x8,
   10487                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10488                         mkU8(7) ),
   10489                   binop(Iop_SarN8x8,
   10490                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10491                         mkU8(7) ) ));
   10492          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10493          assign( newdata,
   10494                  binop(Iop_OrV128,
   10495                        binop(Iop_AndV128,
   10496                              mkexpr(regD),
   10497                              mkexpr(mask) ),
   10498                        binop(Iop_AndV128,
   10499                              mkexpr(olddata),
   10500                              unop(Iop_NotV128, mkexpr(mask)))) );
   10501          storeLE( mkexpr(addr), mkexpr(newdata) );
   10502 
   10503          delta += 2+1;
   10504          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10505                                    nameXMMReg( gregOfRM(modrm) ) );
   10506          goto decode_success;
   10507       }
   10508       /* else fall through */
   10509    }
   10510 
   10511    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   10512    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10513       modrm = getIByte(delta+2);
   10514       if (sz == 2 && !epartIsReg(modrm)) {
   10515          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10516          gen_SEGV_if_not_16_aligned( addr );
   10517          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10518          DIP("movntdq %s,%s\n", dis_buf,
   10519                                 nameXMMReg(gregOfRM(modrm)));
   10520          delta += 2+alen;
   10521          goto decode_success;
   10522       }
   10523       /* else fall through */
   10524    }
   10525 
   10526    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   10527    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10528       vassert(sz == 4);
   10529       modrm = getIByte(delta+2);
   10530       if (!epartIsReg(modrm)) {
   10531          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10532          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
   10533          DIP("movnti %s,%s\n", dis_buf,
   10534                                nameIReg(4, gregOfRM(modrm)));
   10535          delta += 2+alen;
   10536          goto decode_success;
   10537       }
   10538       /* else fall through */
   10539    }
   10540 
   10541    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10542       or lo half xmm).  */
   10543    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10544       modrm = getIByte(delta+2);
   10545       if (epartIsReg(modrm)) {
   10546          /* fall through, awaiting test case */
   10547          /* dst: lo half copied, hi half zeroed */
   10548       } else {
   10549          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10550          storeLE( mkexpr(addr),
   10551                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10552          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10553          delta += 2+alen;
   10554          goto decode_success;
   10555       }
   10556    }
   10557 
   10558    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10559       hi half). */
   10560    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10561       vassert(sz == 4);
   10562       modrm = getIByte(delta+3);
   10563       if (epartIsReg(modrm)) {
   10564          do_MMX_preamble();
   10565          putXMMReg( gregOfRM(modrm),
   10566                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10567          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10568                                 nameXMMReg(gregOfRM(modrm)));
   10569          delta += 3+1;
   10570          goto decode_success;
   10571       } else {
   10572          /* fall through, apparently no mem case for this insn */
   10573       }
   10574    }
   10575 
   10576    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10577       G (lo half xmm).  Upper half of G is zeroed out. */
   10578    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10579       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10580       If E is reg, upper half of G is unchanged. */
   10581    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10582        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
   10583       vassert(sz == 4);
   10584       modrm = getIByte(delta+3);
   10585       if (epartIsReg(modrm)) {
   10586          putXMMRegLane64( gregOfRM(modrm), 0,
   10587                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10588          if (insn[0] == 0xF3/*MOVQ*/) {
   10589             /* zero bits 127:64 */
   10590             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10591          }
   10592          DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10593                               nameXMMReg(gregOfRM(modrm)));
   10594          delta += 3+1;
   10595       } else {
   10596          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10597          /* zero bits 127:64 */
   10598          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10599          /* write bits 63:0 */
   10600          putXMMRegLane64( gregOfRM(modrm), 0,
   10601                           loadLE(Ity_I64, mkexpr(addr)) );
   10602          DIP("movsd %s,%s\n", dis_buf,
   10603                               nameXMMReg(gregOfRM(modrm)));
   10604          delta += 3+alen;
   10605       }
   10606       goto decode_success;
   10607    }
   10608 
   10609    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10610       or lo half xmm). */
   10611    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10612       vassert(sz == 4);
   10613       modrm = getIByte(delta+3);
   10614       if (epartIsReg(modrm)) {
   10615          putXMMRegLane64( eregOfRM(modrm), 0,
   10616                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10617          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10618                               nameXMMReg(eregOfRM(modrm)));
   10619          delta += 3+1;
   10620       } else {
   10621          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10622          storeLE( mkexpr(addr),
   10623                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10624          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10625                               dis_buf);
   10626          delta += 3+alen;
   10627       }
   10628       goto decode_success;
   10629    }
   10630 
   10631    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10632    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10633       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10634       goto decode_success;
   10635    }
   10636 
   10637    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10638    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10639       vassert(sz == 4);
   10640       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10641       goto decode_success;
   10642    }
   10643 
   10644    /* 66 0F 56 = ORPD -- G = G or E */
   10645    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10646       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10647       goto decode_success;
   10648    }
   10649 
   10650    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10651    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10652       Int    select;
   10653       IRTemp sV = newTemp(Ity_V128);
   10654       IRTemp dV = newTemp(Ity_V128);
   10655       IRTemp s1 = newTemp(Ity_I64);
   10656       IRTemp s0 = newTemp(Ity_I64);
   10657       IRTemp d1 = newTemp(Ity_I64);
   10658       IRTemp d0 = newTemp(Ity_I64);
   10659 
   10660       modrm = insn[2];
   10661       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10662 
   10663       if (epartIsReg(modrm)) {
   10664          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10665          select = (Int)insn[3];
   10666          delta += 2+2;
   10667          DIP("shufpd $%d,%s,%s\n", select,
   10668                                    nameXMMReg(eregOfRM(modrm)),
   10669                                    nameXMMReg(gregOfRM(modrm)));
   10670       } else {
   10671          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10672          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10673          select = (Int)insn[2+alen];
   10674          delta += 3+alen;
   10675          DIP("shufpd $%d,%s,%s\n", select,
   10676                                    dis_buf,
   10677                                    nameXMMReg(gregOfRM(modrm)));
   10678       }
   10679 
   10680       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10681       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10682       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10683       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10684 
   10685 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10686 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10687 
   10688       putXMMReg(
   10689          gregOfRM(modrm),
   10690          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10691       );
   10692 
   10693 #     undef SELD
   10694 #     undef SELS
   10695 
   10696       goto decode_success;
   10697    }
   10698 
   10699    /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
   10700    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10701       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10702                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10703       goto decode_success;
   10704    }
   10705 
   10706    /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
   10707    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10708       vassert(sz == 4);
   10709       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10710                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10711       goto decode_success;
   10712    }
   10713 
   10714    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10715    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10716       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10717       goto decode_success;
   10718    }
   10719 
   10720    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10721    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10722       vassert(sz == 4);
   10723       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10724       goto decode_success;
   10725    }
   10726 
   10727    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10728    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10729    /* These just appear to be special cases of SHUFPD */
   10730    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10731       IRTemp s1 = newTemp(Ity_I64);
   10732       IRTemp s0 = newTemp(Ity_I64);
   10733       IRTemp d1 = newTemp(Ity_I64);
   10734       IRTemp d0 = newTemp(Ity_I64);
   10735       IRTemp sV = newTemp(Ity_V128);
   10736       IRTemp dV = newTemp(Ity_V128);
   10737       Bool   hi = toBool(insn[1] == 0x15);
   10738 
   10739       modrm = insn[2];
   10740       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10741 
   10742       if (epartIsReg(modrm)) {
   10743          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10744          delta += 2+1;
   10745          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10746                                   nameXMMReg(eregOfRM(modrm)),
   10747                                   nameXMMReg(gregOfRM(modrm)));
   10748       } else {
   10749          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10750          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10751          delta += 2+alen;
   10752          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10753                                   dis_buf,
   10754                                   nameXMMReg(gregOfRM(modrm)));
   10755       }
   10756 
   10757       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10758       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10759       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10760       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10761 
   10762       if (hi) {
   10763          putXMMReg( gregOfRM(modrm),
   10764                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10765       } else {
   10766          putXMMReg( gregOfRM(modrm),
   10767                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10768       }
   10769 
   10770       goto decode_success;
   10771    }
   10772 
   10773    /* 66 0F 57 = XORPD -- G = G xor E */
   10774    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10775       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10776       goto decode_success;
   10777    }
   10778 
   10779    /* 66 0F 6B = PACKSSDW */
   10780    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10781       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10782                                  "packssdw",
   10783                                  Iop_QNarrowBin32Sto16Sx8, True );
   10784       goto decode_success;
   10785    }
   10786 
   10787    /* 66 0F 63 = PACKSSWB */
   10788    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10789       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10790                                  "packsswb",
   10791                                  Iop_QNarrowBin16Sto8Sx16, True );
   10792       goto decode_success;
   10793    }
   10794 
   10795    /* 66 0F 67 = PACKUSWB */
   10796    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10797       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10798                                  "packuswb",
   10799                                  Iop_QNarrowBin16Sto8Ux16, True );
   10800       goto decode_success;
   10801    }
   10802 
   10803    /* 66 0F FC = PADDB */
   10804    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10805       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10806                                  "paddb", Iop_Add8x16, False );
   10807       goto decode_success;
   10808    }
   10809 
   10810    /* 66 0F FE = PADDD */
   10811    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10812       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10813                                  "paddd", Iop_Add32x4, False );
   10814       goto decode_success;
   10815    }
   10816 
   10817    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10818    /* 0F D4 = PADDQ -- add 64x1 */
   10819    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10820       do_MMX_preamble();
   10821       delta = dis_MMXop_regmem_to_reg (
   10822                 sorb, delta+2, insn[1], "paddq", False );
   10823       goto decode_success;
   10824    }
   10825 
   10826    /* 66 0F D4 = PADDQ */
   10827    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10828       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10829                                  "paddq", Iop_Add64x2, False );
   10830       goto decode_success;
   10831    }
   10832 
   10833    /* 66 0F FD = PADDW */
   10834    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10835       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10836                                  "paddw", Iop_Add16x8, False );
   10837       goto decode_success;
   10838    }
   10839 
   10840    /* 66 0F EC = PADDSB */
   10841    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10842       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10843                                  "paddsb", Iop_QAdd8Sx16, False );
   10844       goto decode_success;
   10845    }
   10846 
   10847    /* 66 0F ED = PADDSW */
   10848    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10849       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10850                                  "paddsw", Iop_QAdd16Sx8, False );
   10851       goto decode_success;
   10852    }
   10853 
   10854    /* 66 0F DC = PADDUSB */
   10855    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10856       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10857                                  "paddusb", Iop_QAdd8Ux16, False );
   10858       goto decode_success;
   10859    }
   10860 
   10861    /* 66 0F DD = PADDUSW */
   10862    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10863       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10864                                  "paddusw", Iop_QAdd16Ux8, False );
   10865       goto decode_success;
   10866    }
   10867 
   10868    /* 66 0F DB = PAND */
   10869    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10870       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10871       goto decode_success;
   10872    }
   10873 
   10874    /* 66 0F DF = PANDN */
   10875    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10876       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10877       goto decode_success;
   10878    }
   10879 
   10880    /* 66 0F E0 = PAVGB */
   10881    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10882       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10883                                  "pavgb", Iop_Avg8Ux16, False );
   10884       goto decode_success;
   10885    }
   10886 
   10887    /* 66 0F E3 = PAVGW */
   10888    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10889       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10890                                  "pavgw", Iop_Avg16Ux8, False );
   10891       goto decode_success;
   10892    }
   10893 
   10894    /* 66 0F 74 = PCMPEQB */
   10895    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10896       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10897                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10898       goto decode_success;
   10899    }
   10900 
   10901    /* 66 0F 76 = PCMPEQD */
   10902    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10903       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10904                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10905       goto decode_success;
   10906    }
   10907 
   10908    /* 66 0F 75 = PCMPEQW */
   10909    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10910       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10911                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10912       goto decode_success;
   10913    }
   10914 
   10915    /* 66 0F 64 = PCMPGTB */
   10916    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10917       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10918                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10919       goto decode_success;
   10920    }
   10921 
   10922    /* 66 0F 66 = PCMPGTD */
   10923    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10924       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10925                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10926       goto decode_success;
   10927    }
   10928 
   10929    /* 66 0F 65 = PCMPGTW */
   10930    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10931       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10932                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10933       goto decode_success;
   10934    }
   10935 
   10936    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10937       zero-extend of it in ireg(G). */
   10938    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10939       modrm = insn[2];
   10940       if (sz == 2 && epartIsReg(modrm)) {
   10941          t5 = newTemp(Ity_V128);
   10942          t4 = newTemp(Ity_I16);
   10943          assign(t5, getXMMReg(eregOfRM(modrm)));
   10944          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10945          switch (insn[3] & 7) {
   10946             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   10947             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   10948             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   10949             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   10950             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   10951             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   10952             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   10953             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   10954             default: vassert(0); /*NOTREACHED*/
   10955          }
   10956          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   10957          DIP("pextrw $%d,%s,%s\n",
   10958              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   10959                            nameIReg(4,gregOfRM(modrm)));
   10960          delta += 4;
   10961          goto decode_success;
   10962       }
   10963       /* else fall through */
   10964    }
   10965 
   10966    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   10967       put it into the specified lane of xmm(G). */
   10968    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   10969       Int lane;
   10970       t4 = newTemp(Ity_I16);
   10971       modrm = insn[2];
   10972 
   10973       if (epartIsReg(modrm)) {
   10974          assign(t4, getIReg(2, eregOfRM(modrm)));
   10975          delta += 3+1;
   10976          lane = insn[3+1-1];
   10977          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10978                                    nameIReg(2,eregOfRM(modrm)),
   10979                                    nameXMMReg(gregOfRM(modrm)));
   10980       } else {
   10981          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10982          delta += 3+alen;
   10983          lane = insn[3+alen-1];
   10984          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   10985          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10986                                    dis_buf,
   10987                                    nameXMMReg(gregOfRM(modrm)));
   10988       }
   10989 
   10990       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   10991       goto decode_success;
   10992    }
   10993 
   10994    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   10995       E(xmm or mem) to G(xmm) */
   10996    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   10997       IRTemp s1V  = newTemp(Ity_V128);
   10998       IRTemp s2V  = newTemp(Ity_V128);
   10999       IRTemp dV   = newTemp(Ity_V128);
   11000       IRTemp s1Hi = newTemp(Ity_I64);
   11001       IRTemp s1Lo = newTemp(Ity_I64);
   11002       IRTemp s2Hi = newTemp(Ity_I64);
   11003       IRTemp s2Lo = newTemp(Ity_I64);
   11004       IRTemp dHi  = newTemp(Ity_I64);
   11005       IRTemp dLo  = newTemp(Ity_I64);
   11006       modrm = insn[2];
   11007       if (epartIsReg(modrm)) {
   11008          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11009          delta += 2+1;
   11010          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11011                                 nameXMMReg(gregOfRM(modrm)));
   11012       } else {
   11013          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11014          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11015          delta += 2+alen;
   11016          DIP("pmaddwd %s,%s\n", dis_buf,
   11017                                 nameXMMReg(gregOfRM(modrm)));
   11018       }
   11019       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11020       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11021       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11022       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11023       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11024       assign( dHi, mkIRExprCCall(
   11025                       Ity_I64, 0/*regparms*/,
   11026                       "x86g_calculate_mmx_pmaddwd",
   11027                       &x86g_calculate_mmx_pmaddwd,
   11028                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11029                    ));
   11030       assign( dLo, mkIRExprCCall(
   11031                       Ity_I64, 0/*regparms*/,
   11032                       "x86g_calculate_mmx_pmaddwd",
   11033                       &x86g_calculate_mmx_pmaddwd,
   11034                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11035                    ));
   11036       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11037       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11038       goto decode_success;
   11039    }
   11040 
   11041    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   11042    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   11043       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11044                                  "pmaxsw", Iop_Max16Sx8, False );
   11045       goto decode_success;
   11046    }
   11047 
   11048    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   11049    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   11050       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11051                                  "pmaxub", Iop_Max8Ux16, False );
   11052       goto decode_success;
   11053    }
   11054 
   11055    /* 66 0F EA = PMINSW -- 16x8 signed min */
   11056    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   11057       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11058                                  "pminsw", Iop_Min16Sx8, False );
   11059       goto decode_success;
   11060    }
   11061 
   11062    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   11063    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   11064       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11065                                  "pminub", Iop_Min8Ux16, False );
   11066       goto decode_success;
   11067    }
   11068 
   11069    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
   11070       in xmm(E), turn them into a 16-bit value, and put zero-extend of
   11071       it in ireg(G). */
   11072    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   11073       modrm = insn[2];
   11074       if (epartIsReg(modrm)) {
   11075          t0 = newTemp(Ity_I64);
   11076          t1 = newTemp(Ity_I64);
   11077          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   11078          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   11079          t5 = newTemp(Ity_I32);
   11080          assign(t5,
   11081                 unop(Iop_16Uto32,
   11082                      binop(Iop_8HLto16,
   11083                            unop(Iop_GetMSBs8x8, mkexpr(t1)),
   11084                            unop(Iop_GetMSBs8x8, mkexpr(t0)))));
   11085          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   11086          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11087                                  nameIReg(4,gregOfRM(modrm)));
   11088          delta += 3;
   11089          goto decode_success;
   11090       }
   11091       /* else fall through */
   11092    }
   11093 
   11094    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   11095    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   11096       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11097                                  "pmulhuw", Iop_MulHi16Ux8, False );
   11098       goto decode_success;
   11099    }
   11100 
   11101    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   11102    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   11103       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11104                                  "pmulhw", Iop_MulHi16Sx8, False );
   11105       goto decode_success;
   11106    }
   11107 
   11108    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   11109    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   11110       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11111                                  "pmullw", Iop_Mul16x8, False );
   11112       goto decode_success;
   11113    }
   11114 
   11115    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11116    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11117       0 to form 64-bit result */
   11118    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11119       IRTemp sV = newTemp(Ity_I64);
   11120       IRTemp dV = newTemp(Ity_I64);
   11121       t1 = newTemp(Ity_I32);
   11122       t0 = newTemp(Ity_I32);
   11123       modrm = insn[2];
   11124 
   11125       do_MMX_preamble();
   11126       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11127 
   11128       if (epartIsReg(modrm)) {
   11129          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11130          delta += 2+1;
   11131          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11132                                 nameMMXReg(gregOfRM(modrm)));
   11133       } else {
   11134          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11135          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11136          delta += 2+alen;
   11137          DIP("pmuludq %s,%s\n", dis_buf,
   11138                                 nameMMXReg(gregOfRM(modrm)));
   11139       }
   11140 
   11141       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   11142       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   11143       putMMXReg( gregOfRM(modrm),
   11144                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   11145       goto decode_success;
   11146    }
   11147 
   11148    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   11149       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   11150       half */
   11151    /* This is a really poor translation -- could be improved if
   11152       performance critical */
   11153    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   11154       IRTemp sV, dV;
   11155       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11156       sV = newTemp(Ity_V128);
   11157       dV = newTemp(Ity_V128);
   11158       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11159       t1 = newTemp(Ity_I64);
   11160       t0 = newTemp(Ity_I64);
   11161       modrm = insn[2];
   11162       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11163 
   11164       if (epartIsReg(modrm)) {
   11165          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11166          delta += 2+1;
   11167          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11168                                 nameXMMReg(gregOfRM(modrm)));
   11169       } else {
   11170          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11171          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11172          delta += 2+alen;
   11173          DIP("pmuludq %s,%s\n", dis_buf,
   11174                                 nameXMMReg(gregOfRM(modrm)));
   11175       }
   11176 
   11177       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   11178       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11179 
   11180       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   11181       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   11182       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   11183       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   11184       goto decode_success;
   11185    }
   11186 
   11187    /* 66 0F EB = POR */
   11188    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   11189       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   11190       goto decode_success;
   11191    }
   11192 
   11193    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   11194       from E(xmm or mem) to G(xmm) */
   11195    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   11196       IRTemp s1V  = newTemp(Ity_V128);
   11197       IRTemp s2V  = newTemp(Ity_V128);
   11198       IRTemp dV   = newTemp(Ity_V128);
   11199       IRTemp s1Hi = newTemp(Ity_I64);
   11200       IRTemp s1Lo = newTemp(Ity_I64);
   11201       IRTemp s2Hi = newTemp(Ity_I64);
   11202       IRTemp s2Lo = newTemp(Ity_I64);
   11203       IRTemp dHi  = newTemp(Ity_I64);
   11204       IRTemp dLo  = newTemp(Ity_I64);
   11205       modrm = insn[2];
   11206       if (epartIsReg(modrm)) {
   11207          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11208          delta += 2+1;
   11209          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11210                                nameXMMReg(gregOfRM(modrm)));
   11211       } else {
   11212          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11213          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11214          delta += 2+alen;
   11215          DIP("psadbw %s,%s\n", dis_buf,
   11216                                nameXMMReg(gregOfRM(modrm)));
   11217       }
   11218       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11219       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11220       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11221       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11222       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11223       assign( dHi, mkIRExprCCall(
   11224                       Ity_I64, 0/*regparms*/,
   11225                       "x86g_calculate_mmx_psadbw",
   11226                       &x86g_calculate_mmx_psadbw,
   11227                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11228                    ));
   11229       assign( dLo, mkIRExprCCall(
   11230                       Ity_I64, 0/*regparms*/,
   11231                       "x86g_calculate_mmx_psadbw",
   11232                       &x86g_calculate_mmx_psadbw,
   11233                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11234                    ));
   11235       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11236       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11237       goto decode_success;
   11238    }
   11239 
   11240    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   11241    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   11242       Int order;
   11243       IRTemp sV, dV, s3, s2, s1, s0;
   11244       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11245       sV = newTemp(Ity_V128);
   11246       dV = newTemp(Ity_V128);
   11247       modrm = insn[2];
   11248       if (epartIsReg(modrm)) {
   11249          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11250          order = (Int)insn[3];
   11251          delta += 2+2;
   11252          DIP("pshufd $%d,%s,%s\n", order,
   11253                                    nameXMMReg(eregOfRM(modrm)),
   11254                                    nameXMMReg(gregOfRM(modrm)));
   11255       } else {
   11256          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11257          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11258 	 order = (Int)insn[2+alen];
   11259          delta += 3+alen;
   11260          DIP("pshufd $%d,%s,%s\n", order,
   11261                                    dis_buf,
   11262                                    nameXMMReg(gregOfRM(modrm)));
   11263       }
   11264       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11265 
   11266 #     define SEL(n) \
   11267                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11268       assign(dV,
   11269 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11270                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11271       );
   11272       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11273 #     undef SEL
   11274       goto decode_success;
   11275    }
   11276 
   11277    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11278       mem) to G(xmm), and copy lower half */
   11279    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11280       Int order;
   11281       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11282       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11283       sV   = newTemp(Ity_V128);
   11284       dV   = newTemp(Ity_V128);
   11285       sVhi = newTemp(Ity_I64);
   11286       dVhi = newTemp(Ity_I64);
   11287       modrm = insn[3];
   11288       if (epartIsReg(modrm)) {
   11289          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11290          order = (Int)insn[4];
   11291          delta += 4+1;
   11292          DIP("pshufhw $%d,%s,%s\n", order,
   11293                                     nameXMMReg(eregOfRM(modrm)),
   11294                                     nameXMMReg(gregOfRM(modrm)));
   11295       } else {
   11296          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11297          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11298 	 order = (Int)insn[3+alen];
   11299          delta += 4+alen;
   11300          DIP("pshufhw $%d,%s,%s\n", order,
   11301                                     dis_buf,
   11302                                     nameXMMReg(gregOfRM(modrm)));
   11303       }
   11304       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11305       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11306 
   11307 #     define SEL(n) \
   11308                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11309       assign(dVhi,
   11310 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11311                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11312       );
   11313       assign(dV, binop( Iop_64HLtoV128,
   11314                         mkexpr(dVhi),
   11315                         unop(Iop_V128to64, mkexpr(sV))) );
   11316       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11317 #     undef SEL
   11318       goto decode_success;
   11319    }
   11320 
   11321    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11322       mem) to G(xmm), and copy upper half */
   11323    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11324       Int order;
   11325       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11326       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11327       sV   = newTemp(Ity_V128);
   11328       dV   = newTemp(Ity_V128);
   11329       sVlo = newTemp(Ity_I64);
   11330       dVlo = newTemp(Ity_I64);
   11331       modrm = insn[3];
   11332       if (epartIsReg(modrm)) {
   11333          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11334          order = (Int)insn[4];
   11335          delta += 4+1;
   11336          DIP("pshuflw $%d,%s,%s\n", order,
   11337                                     nameXMMReg(eregOfRM(modrm)),
   11338                                     nameXMMReg(gregOfRM(modrm)));
   11339       } else {
   11340          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11341          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11342 	 order = (Int)insn[3+alen];
   11343          delta += 4+alen;
   11344          DIP("pshuflw $%d,%s,%s\n", order,
   11345                                     dis_buf,
   11346                                     nameXMMReg(gregOfRM(modrm)));
   11347       }
   11348       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11349       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11350 
   11351 #     define SEL(n) \
   11352                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11353       assign(dVlo,
   11354 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11355                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11356       );
   11357       assign(dV, binop( Iop_64HLtoV128,
   11358                         unop(Iop_V128HIto64, mkexpr(sV)),
   11359                         mkexpr(dVlo) ) );
   11360       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11361 #     undef SEL
   11362       goto decode_success;
   11363    }
   11364 
   11365    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11366    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11367        && epartIsReg(insn[2])
   11368        && gregOfRM(insn[2]) == 6) {
   11369       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11370       goto decode_success;
   11371    }
   11372 
   11373    /* 66 0F F2 = PSLLD by E */
   11374    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11375       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11376       goto decode_success;
   11377    }
   11378 
   11379    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11380    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11381        && epartIsReg(insn[2])
   11382        && gregOfRM(insn[2]) == 7) {
   11383       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11384       Int    imm = (Int)insn[3];
   11385       Int    reg = eregOfRM(insn[2]);
   11386       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11387       vassert(imm >= 0 && imm <= 255);
   11388       delta += 4;
   11389 
   11390       sV    = newTemp(Ity_V128);
   11391       dV    = newTemp(Ity_V128);
   11392       hi64  = newTemp(Ity_I64);
   11393       lo64  = newTemp(Ity_I64);
   11394       hi64r = newTemp(Ity_I64);
   11395       lo64r = newTemp(Ity_I64);
   11396 
   11397       if (imm >= 16) {
   11398          putXMMReg(reg, mkV128(0x0000));
   11399          goto decode_success;
   11400       }
   11401 
   11402       assign( sV, getXMMReg(reg) );
   11403       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11404       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11405 
   11406       if (imm == 0) {
   11407          assign( lo64r, mkexpr(lo64) );
   11408          assign( hi64r, mkexpr(hi64) );
   11409       }
   11410       else
   11411       if (imm == 8) {
   11412          assign( lo64r, mkU64(0) );
   11413          assign( hi64r, mkexpr(lo64) );
   11414       }
   11415       else
   11416       if (imm > 8) {
   11417          assign( lo64r, mkU64(0) );
   11418          assign( hi64r, binop( Iop_Shl64,
   11419                                mkexpr(lo64),
   11420                                mkU8( 8*(imm-8) ) ));
   11421       } else {
   11422          assign( lo64r, binop( Iop_Shl64,
   11423                                mkexpr(lo64),
   11424                                mkU8(8 * imm) ));
   11425          assign( hi64r,
   11426                  binop( Iop_Or64,
   11427                         binop(Iop_Shl64, mkexpr(hi64),
   11428                                          mkU8(8 * imm)),
   11429                         binop(Iop_Shr64, mkexpr(lo64),
   11430                                          mkU8(8 * (8 - imm)) )
   11431                       )
   11432                );
   11433       }
   11434       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11435       putXMMReg(reg, mkexpr(dV));
   11436       goto decode_success;
   11437    }
   11438 
   11439    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11440    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11441        && epartIsReg(insn[2])
   11442        && gregOfRM(insn[2]) == 6) {
   11443       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11444       goto decode_success;
   11445    }
   11446 
   11447    /* 66 0F F3 = PSLLQ by E */
   11448    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11449       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11450       goto decode_success;
   11451    }
   11452 
   11453    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11454    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11455        && epartIsReg(insn[2])
   11456        && gregOfRM(insn[2]) == 6) {
   11457       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11458       goto decode_success;
   11459    }
   11460 
   11461    /* 66 0F F1 = PSLLW by E */
   11462    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11463       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11464       goto decode_success;
   11465    }
   11466 
   11467    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11468    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11469        && epartIsReg(insn[2])
   11470        && gregOfRM(insn[2]) == 4) {
   11471       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11472       goto decode_success;
   11473    }
   11474 
   11475    /* 66 0F E2 = PSRAD by E */
   11476    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11477       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11478       goto decode_success;
   11479    }
   11480 
   11481    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11482    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11483        && epartIsReg(insn[2])
   11484        && gregOfRM(insn[2]) == 4) {
   11485       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11486       goto decode_success;
   11487    }
   11488 
   11489    /* 66 0F E1 = PSRAW by E */
   11490    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11491       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11492       goto decode_success;
   11493    }
   11494 
   11495    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11496    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11497        && epartIsReg(insn[2])
   11498        && gregOfRM(insn[2]) == 2) {
   11499       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11500       goto decode_success;
   11501    }
   11502 
   11503    /* 66 0F D2 = PSRLD by E */
   11504    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11505       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11506       goto decode_success;
   11507    }
   11508 
   11509    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11510    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11511        && epartIsReg(insn[2])
   11512        && gregOfRM(insn[2]) == 3) {
   11513       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11514       Int    imm = (Int)insn[3];
   11515       Int    reg = eregOfRM(insn[2]);
   11516       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11517       vassert(imm >= 0 && imm <= 255);
   11518       delta += 4;
   11519 
   11520       sV    = newTemp(Ity_V128);
   11521       dV    = newTemp(Ity_V128);
   11522       hi64  = newTemp(Ity_I64);
   11523       lo64  = newTemp(Ity_I64);
   11524       hi64r = newTemp(Ity_I64);
   11525       lo64r = newTemp(Ity_I64);
   11526 
   11527       if (imm >= 16) {
   11528          putXMMReg(reg, mkV128(0x0000));
   11529          goto decode_success;
   11530       }
   11531 
   11532       assign( sV, getXMMReg(reg) );
   11533       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11534       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11535 
   11536       if (imm == 0) {
   11537          assign( lo64r, mkexpr(lo64) );
   11538          assign( hi64r, mkexpr(hi64) );
   11539       }
   11540       else
   11541       if (imm == 8) {
   11542          assign( hi64r, mkU64(0) );
   11543          assign( lo64r, mkexpr(hi64) );
   11544       }
   11545       else
   11546       if (imm > 8) {
   11547          assign( hi64r, mkU64(0) );
   11548          assign( lo64r, binop( Iop_Shr64,
   11549                                mkexpr(hi64),
   11550                                mkU8( 8*(imm-8) ) ));
   11551       } else {
   11552          assign( hi64r, binop( Iop_Shr64,
   11553                                mkexpr(hi64),
   11554                                mkU8(8 * imm) ));
   11555          assign( lo64r,
   11556                  binop( Iop_Or64,
   11557                         binop(Iop_Shr64, mkexpr(lo64),
   11558                                          mkU8(8 * imm)),
   11559                         binop(Iop_Shl64, mkexpr(hi64),
   11560                                          mkU8(8 * (8 - imm)) )
   11561                       )
   11562                );
   11563       }
   11564 
   11565       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11566       putXMMReg(reg, mkexpr(dV));
   11567       goto decode_success;
   11568    }
   11569 
   11570    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11571    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11572        && epartIsReg(insn[2])
   11573        && gregOfRM(insn[2]) == 2) {
   11574       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11575       goto decode_success;
   11576    }
   11577 
   11578    /* 66 0F D3 = PSRLQ by E */
   11579    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11580       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11581       goto decode_success;
   11582    }
   11583 
   11584    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11585    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11586        && epartIsReg(insn[2])
   11587        && gregOfRM(insn[2]) == 2) {
   11588       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11589       goto decode_success;
   11590    }
   11591 
   11592    /* 66 0F D1 = PSRLW by E */
   11593    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11594       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11595       goto decode_success;
   11596    }
   11597 
   11598    /* 66 0F F8 = PSUBB */
   11599    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11600       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11601                                  "psubb", Iop_Sub8x16, False );
   11602       goto decode_success;
   11603    }
   11604 
   11605    /* 66 0F FA = PSUBD */
   11606    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11607       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11608                                  "psubd", Iop_Sub32x4, False );
   11609       goto decode_success;
   11610    }
   11611 
   11612    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11613    /* 0F FB = PSUBQ -- sub 64x1 */
   11614    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11615       do_MMX_preamble();
   11616       delta = dis_MMXop_regmem_to_reg (
   11617                 sorb, delta+2, insn[1], "psubq", False );
   11618       goto decode_success;
   11619    }
   11620 
   11621    /* 66 0F FB = PSUBQ */
   11622    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11623       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11624                                  "psubq", Iop_Sub64x2, False );
   11625       goto decode_success;
   11626    }
   11627 
   11628    /* 66 0F F9 = PSUBW */
   11629    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11630       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11631                                  "psubw", Iop_Sub16x8, False );
   11632       goto decode_success;
   11633    }
   11634 
   11635    /* 66 0F E8 = PSUBSB */
   11636    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11637       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11638                                  "psubsb", Iop_QSub8Sx16, False );
   11639       goto decode_success;
   11640    }
   11641 
   11642    /* 66 0F E9 = PSUBSW */
   11643    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11644       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11645                                  "psubsw", Iop_QSub16Sx8, False );
   11646       goto decode_success;
   11647    }
   11648 
   11649    /* 66 0F D8 = PSUBUSB */
   11650    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11651       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11652                                  "psubusb", Iop_QSub8Ux16, False );
   11653       goto decode_success;
   11654    }
   11655 
   11656    /* 66 0F D9 = PSUBUSW */
   11657    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11658       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11659                                  "psubusw", Iop_QSub16Ux8, False );
   11660       goto decode_success;
   11661    }
   11662 
   11663    /* 66 0F 68 = PUNPCKHBW */
   11664    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11665       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11666                                  "punpckhbw",
   11667                                  Iop_InterleaveHI8x16, True );
   11668       goto decode_success;
   11669    }
   11670 
   11671    /* 66 0F 6A = PUNPCKHDQ */
   11672    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11673       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11674                                  "punpckhdq",
   11675                                  Iop_InterleaveHI32x4, True );
   11676       goto decode_success;
   11677    }
   11678 
   11679    /* 66 0F 6D = PUNPCKHQDQ */
   11680    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11681       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11682                                  "punpckhqdq",
   11683                                  Iop_InterleaveHI64x2, True );
   11684       goto decode_success;
   11685    }
   11686 
   11687    /* 66 0F 69 = PUNPCKHWD */
   11688    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11689       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11690                                  "punpckhwd",
   11691                                  Iop_InterleaveHI16x8, True );
   11692       goto decode_success;
   11693    }
   11694 
   11695    /* 66 0F 60 = PUNPCKLBW */
   11696    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11697       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11698                                  "punpcklbw",
   11699                                  Iop_InterleaveLO8x16, True );
   11700       goto decode_success;
   11701    }
   11702 
   11703    /* 66 0F 62 = PUNPCKLDQ */
   11704    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11705       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11706                                  "punpckldq",
   11707                                  Iop_InterleaveLO32x4, True );
   11708       goto decode_success;
   11709    }
   11710 
   11711    /* 66 0F 6C = PUNPCKLQDQ */
   11712    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11713       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11714                                  "punpcklqdq",
   11715                                  Iop_InterleaveLO64x2, True );
   11716       goto decode_success;
   11717    }
   11718 
   11719    /* 66 0F 61 = PUNPCKLWD */
   11720    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11721       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11722                                  "punpcklwd",
   11723                                  Iop_InterleaveLO16x8, True );
   11724       goto decode_success;
   11725    }
   11726 
   11727    /* 66 0F EF = PXOR */
   11728    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11729       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11730       goto decode_success;
   11731    }
   11732 
   11733 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11734 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11735 //--        && (!epartIsReg(insn[2]))
   11736 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11737 //--       Bool store = gregOfRM(insn[2]) == 0;
   11738 //--       vg_assert(sz == 4);
   11739 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11740 //--       t1   = LOW24(pair);
   11741 //--       eip += 2+HI8(pair);
   11742 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11743 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11744 //--                   Lit16, (UShort)insn[2],
   11745 //--                   TempReg, t1 );
   11746 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11747 //--       goto decode_success;
   11748 //--    }
   11749 
   11750    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11751    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11752        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11753 
   11754       /* This is something of a hack.  We need to know the size of the
   11755          cache line containing addr.  Since we don't (easily), assume
   11756          256 on the basis that no real cache would have a line that
   11757          big.  It's safe to invalidate more stuff than we need, just
   11758          inefficient. */
   11759       UInt lineszB = 256;
   11760 
   11761       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11762       delta += 2+alen;
   11763 
   11764       /* Round addr down to the start of the containing block. */
   11765       stmt( IRStmt_Put(
   11766                OFFB_CMSTART,
   11767                binop( Iop_And32,
   11768                       mkexpr(addr),
   11769                       mkU32( ~(lineszB-1) ))) );
   11770 
   11771       stmt( IRStmt_Put(OFFB_CMLEN, mkU32(lineszB) ) );
   11772 
   11773       jmp_lit(&dres, Ijk_InvalICache, (Addr32)(guest_EIP_bbstart+delta));
   11774 
   11775       DIP("clflush %s\n", dis_buf);
   11776       goto decode_success;
   11777    }
   11778 
   11779    /* ---------------------------------------------------- */
   11780    /* --- end of the SSE2 decoder.                     --- */
   11781    /* ---------------------------------------------------- */
   11782 
   11783    /* ---------------------------------------------------- */
   11784    /* --- start of the SSE3 decoder.                   --- */
   11785    /* ---------------------------------------------------- */
   11786 
   11787    /* Skip parts of the decoder which don't apply given the stated
   11788       guest subarchitecture. */
   11789    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3))
   11790       goto after_sse_decoders; /* no SSE3 capabilities */
   11791 
   11792    insn = &guest_code[delta];
   11793 
   11794    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11795       duplicating some lanes (2:2:0:0). */
   11796    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11797       duplicating some lanes (3:3:1:1). */
   11798    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11799        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11800       IRTemp s3, s2, s1, s0;
   11801       IRTemp sV  = newTemp(Ity_V128);
   11802       Bool   isH = insn[2] == 0x16;
   11803       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11804 
   11805       modrm = insn[3];
   11806       if (epartIsReg(modrm)) {
   11807          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11808          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11809                                   nameXMMReg(eregOfRM(modrm)),
   11810                                   nameXMMReg(gregOfRM(modrm)));
   11811          delta += 3+1;
   11812       } else {
   11813          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11814          gen_SEGV_if_not_16_aligned( addr );
   11815          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11816          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11817 	     dis_buf,
   11818              nameXMMReg(gregOfRM(modrm)));
   11819          delta += 3+alen;
   11820       }
   11821 
   11822       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11823       putXMMReg( gregOfRM(modrm),
   11824                  isH ? mk128from32s( s3, s3, s1, s1 )
   11825                      : mk128from32s( s2, s2, s0, s0 ) );
   11826       goto decode_success;
   11827    }
   11828 
   11829    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11830       duplicating some lanes (0:1:0:1). */
   11831    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11832       IRTemp sV = newTemp(Ity_V128);
   11833       IRTemp d0 = newTemp(Ity_I64);
   11834 
   11835       modrm = insn[3];
   11836       if (epartIsReg(modrm)) {
   11837          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11838          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11839                                 nameXMMReg(gregOfRM(modrm)));
   11840          delta += 3+1;
   11841          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11842       } else {
   11843          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11844          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11845          DIP("movddup %s,%s\n", dis_buf,
   11846                                 nameXMMReg(gregOfRM(modrm)));
   11847          delta += 3+alen;
   11848       }
   11849 
   11850       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11851       goto decode_success;
   11852    }
   11853 
   11854    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11855    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11856       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11857       IRTemp eV   = newTemp(Ity_V128);
   11858       IRTemp gV   = newTemp(Ity_V128);
   11859       IRTemp addV = newTemp(Ity_V128);
   11860       IRTemp subV = newTemp(Ity_V128);
   11861       IRTemp rm     = newTemp(Ity_I32);
   11862       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11863 
   11864       modrm = insn[3];
   11865       if (epartIsReg(modrm)) {
   11866          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11867          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11868                                  nameXMMReg(gregOfRM(modrm)));
   11869          delta += 3+1;
   11870       } else {
   11871          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11872          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11873          DIP("addsubps %s,%s\n", dis_buf,
   11874                                  nameXMMReg(gregOfRM(modrm)));
   11875          delta += 3+alen;
   11876       }
   11877 
   11878       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11879 
   11880       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11881       assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11882       assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11883 
   11884       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11885       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11886 
   11887       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11888       goto decode_success;
   11889    }
   11890 
   11891    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11892    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11893       IRTemp eV   = newTemp(Ity_V128);
   11894       IRTemp gV   = newTemp(Ity_V128);
   11895       IRTemp addV = newTemp(Ity_V128);
   11896       IRTemp subV = newTemp(Ity_V128);
   11897       IRTemp a1     = newTemp(Ity_I64);
   11898       IRTemp s0     = newTemp(Ity_I64);
   11899       IRTemp rm     = newTemp(Ity_I32);
   11900 
   11901       modrm = insn[2];
   11902       if (epartIsReg(modrm)) {
   11903          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11904          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11905                                  nameXMMReg(gregOfRM(modrm)));
   11906          delta += 2+1;
   11907       } else {
   11908          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11909          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11910          DIP("addsubpd %s,%s\n", dis_buf,
   11911                                  nameXMMReg(gregOfRM(modrm)));
   11912          delta += 2+alen;
   11913       }
   11914 
   11915       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11916 
   11917       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11918       assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11919       assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(gV), mkexpr(eV)) );
   11920 
   11921       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11922       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11923 
   11924       putXMMReg( gregOfRM(modrm),
   11925                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11926       goto decode_success;
   11927    }
   11928 
   11929    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11930    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11931    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11932        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11933       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11934       IRTemp eV     = newTemp(Ity_V128);
   11935       IRTemp gV     = newTemp(Ity_V128);
   11936       IRTemp leftV  = newTemp(Ity_V128);
   11937       IRTemp rightV = newTemp(Ity_V128);
   11938       IRTemp rm     = newTemp(Ity_I32);
   11939       Bool   isAdd  = insn[2] == 0x7C;
   11940       const HChar* str = isAdd ? "add" : "sub";
   11941       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11942 
   11943       modrm = insn[3];
   11944       if (epartIsReg(modrm)) {
   11945          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11946          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11947                                    nameXMMReg(gregOfRM(modrm)));
   11948          delta += 3+1;
   11949       } else {
   11950          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11951          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11952          DIP("h%sps %s,%s\n", str, dis_buf,
   11953                                    nameXMMReg(gregOfRM(modrm)));
   11954          delta += 3+alen;
   11955       }
   11956 
   11957       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11958 
   11959       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   11960       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   11961 
   11962       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   11963       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   11964 
   11965       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11966       putXMMReg( gregOfRM(modrm),
   11967                  triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   11968                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   11969       goto decode_success;
   11970    }
   11971 
   11972    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   11973    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   11974    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   11975       IRTemp e1     = newTemp(Ity_I64);
   11976       IRTemp e0     = newTemp(Ity_I64);
   11977       IRTemp g1     = newTemp(Ity_I64);
   11978       IRTemp g0     = newTemp(Ity_I64);
   11979       IRTemp eV     = newTemp(Ity_V128);
   11980       IRTemp gV     = newTemp(Ity_V128);
   11981       IRTemp leftV  = newTemp(Ity_V128);
   11982       IRTemp rightV = newTemp(Ity_V128);
   11983       IRTemp rm     = newTemp(Ity_I32);
   11984       Bool   isAdd  = insn[1] == 0x7C;
   11985       const HChar* str = isAdd ? "add" : "sub";
   11986 
   11987       modrm = insn[2];
   11988       if (epartIsReg(modrm)) {
   11989          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11990          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11991                                    nameXMMReg(gregOfRM(modrm)));
   11992          delta += 2+1;
   11993       } else {
   11994          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11995          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11996          DIP("h%spd %s,%s\n", str, dis_buf,
   11997                               nameXMMReg(gregOfRM(modrm)));
   11998          delta += 2+alen;
   11999       }
   12000 
   12001       assign( gV, getXMMReg(gregOfRM(modrm)) );
   12002 
   12003       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   12004       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   12005       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   12006       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   12007 
   12008       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   12009       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   12010 
   12011       assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   12012       putXMMReg( gregOfRM(modrm),
   12013                  triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   12014                        mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   12015       goto decode_success;
   12016    }
   12017 
   12018    /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
   12019    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   12020       modrm = getIByte(delta+3);
   12021       if (epartIsReg(modrm)) {
   12022          goto decode_failure;
   12023       } else {
   12024          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12025          putXMMReg( gregOfRM(modrm),
   12026                     loadLE(Ity_V128, mkexpr(addr)) );
   12027          DIP("lddqu %s,%s\n", dis_buf,
   12028                               nameXMMReg(gregOfRM(modrm)));
   12029          delta += 3+alen;
   12030       }
   12031       goto decode_success;
   12032    }
   12033 
   12034    /* ---------------------------------------------------- */
   12035    /* --- end of the SSE3 decoder.                     --- */
   12036    /* ---------------------------------------------------- */
   12037 
   12038    /* ---------------------------------------------------- */
   12039    /* --- start of the SSSE3 decoder.                  --- */
   12040    /* ---------------------------------------------------- */
   12041 
   12042    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12043       Unsigned Bytes (MMX) */
   12044    if (sz == 4
   12045        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12046       IRTemp sV        = newTemp(Ity_I64);
   12047       IRTemp dV        = newTemp(Ity_I64);
   12048       IRTemp sVoddsSX  = newTemp(Ity_I64);
   12049       IRTemp sVevensSX = newTemp(Ity_I64);
   12050       IRTemp dVoddsZX  = newTemp(Ity_I64);
   12051       IRTemp dVevensZX = newTemp(Ity_I64);
   12052 
   12053       modrm = insn[3];
   12054       do_MMX_preamble();
   12055       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12056 
   12057       if (epartIsReg(modrm)) {
   12058          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12059          delta += 3+1;
   12060          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12061                                   nameMMXReg(gregOfRM(modrm)));
   12062       } else {
   12063          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12064          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12065          delta += 3+alen;
   12066          DIP("pmaddubsw %s,%s\n", dis_buf,
   12067                                   nameMMXReg(gregOfRM(modrm)));
   12068       }
   12069 
   12070       /* compute dV unsigned x sV signed */
   12071       assign( sVoddsSX,
   12072               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   12073       assign( sVevensSX,
   12074               binop(Iop_SarN16x4,
   12075                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   12076                     mkU8(8)) );
   12077       assign( dVoddsZX,
   12078               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   12079       assign( dVevensZX,
   12080               binop(Iop_ShrN16x4,
   12081                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   12082                     mkU8(8)) );
   12083 
   12084       putMMXReg(
   12085          gregOfRM(modrm),
   12086          binop(Iop_QAdd16Sx4,
   12087                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12088                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12089          )
   12090       );
   12091       goto decode_success;
   12092    }
   12093 
   12094    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   12095       Unsigned Bytes (XMM) */
   12096    if (sz == 2
   12097        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   12098       IRTemp sV        = newTemp(Ity_V128);
   12099       IRTemp dV        = newTemp(Ity_V128);
   12100       IRTemp sVoddsSX  = newTemp(Ity_V128);
   12101       IRTemp sVevensSX = newTemp(Ity_V128);
   12102       IRTemp dVoddsZX  = newTemp(Ity_V128);
   12103       IRTemp dVevensZX = newTemp(Ity_V128);
   12104 
   12105       modrm = insn[3];
   12106       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12107 
   12108       if (epartIsReg(modrm)) {
   12109          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12110          delta += 3+1;
   12111          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12112                                   nameXMMReg(gregOfRM(modrm)));
   12113       } else {
   12114          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12115          gen_SEGV_if_not_16_aligned( addr );
   12116          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12117          delta += 3+alen;
   12118          DIP("pmaddubsw %s,%s\n", dis_buf,
   12119                                   nameXMMReg(gregOfRM(modrm)));
   12120       }
   12121 
   12122       /* compute dV unsigned x sV signed */
   12123       assign( sVoddsSX,
   12124               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   12125       assign( sVevensSX,
   12126               binop(Iop_SarN16x8,
   12127                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   12128                     mkU8(8)) );
   12129       assign( dVoddsZX,
   12130               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   12131       assign( dVevensZX,
   12132               binop(Iop_ShrN16x8,
   12133                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   12134                     mkU8(8)) );
   12135 
   12136       putXMMReg(
   12137          gregOfRM(modrm),
   12138          binop(Iop_QAdd16Sx8,
   12139                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   12140                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   12141          )
   12142       );
   12143       goto decode_success;
   12144    }
   12145 
   12146    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   12147    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   12148       mmx) and G to G (mmx). */
   12149    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   12150       mmx) and G to G (mmx). */
   12151    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   12152       to G (mmx). */
   12153    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   12154       to G (mmx). */
   12155    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   12156       to G (mmx). */
   12157    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   12158       to G (mmx). */
   12159 
   12160    if (sz == 4
   12161        && insn[0] == 0x0F && insn[1] == 0x38
   12162        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12163            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12164       const HChar* str = "???";
   12165       IROp   opV64  = Iop_INVALID;
   12166       IROp   opCatO = Iop_CatOddLanes16x4;
   12167       IROp   opCatE = Iop_CatEvenLanes16x4;
   12168       IRTemp sV     = newTemp(Ity_I64);
   12169       IRTemp dV     = newTemp(Ity_I64);
   12170 
   12171       modrm = insn[3];
   12172 
   12173       switch (insn[2]) {
   12174          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12175          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12176          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12177          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12178          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12179          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12180          default: vassert(0);
   12181       }
   12182       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12183          opCatO = Iop_InterleaveHI32x2;
   12184          opCatE = Iop_InterleaveLO32x2;
   12185       }
   12186 
   12187       do_MMX_preamble();
   12188       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12189 
   12190       if (epartIsReg(modrm)) {
   12191          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12192          delta += 3+1;
   12193          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12194                                   nameMMXReg(gregOfRM(modrm)));
   12195       } else {
   12196          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12197          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12198          delta += 3+alen;
   12199          DIP("ph%s %s,%s\n", str, dis_buf,
   12200                                   nameMMXReg(gregOfRM(modrm)));
   12201       }
   12202 
   12203       putMMXReg(
   12204          gregOfRM(modrm),
   12205          binop(opV64,
   12206                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   12207                binop(opCatO,mkexpr(sV),mkexpr(dV))
   12208          )
   12209       );
   12210       goto decode_success;
   12211    }
   12212 
   12213    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   12214       xmm) and G to G (xmm). */
   12215    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   12216       xmm) and G to G (xmm). */
   12217    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   12218       G to G (xmm). */
   12219    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   12220       G to G (xmm). */
   12221    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   12222       G to G (xmm). */
   12223    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   12224       G to G (xmm). */
   12225 
   12226    if (sz == 2
   12227        && insn[0] == 0x0F && insn[1] == 0x38
   12228        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12229            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12230       const HChar* str = "???";
   12231       IROp   opV64  = Iop_INVALID;
   12232       IROp   opCatO = Iop_CatOddLanes16x4;
   12233       IROp   opCatE = Iop_CatEvenLanes16x4;
   12234       IRTemp sV     = newTemp(Ity_V128);
   12235       IRTemp dV     = newTemp(Ity_V128);
   12236       IRTemp sHi    = newTemp(Ity_I64);
   12237       IRTemp sLo    = newTemp(Ity_I64);
   12238       IRTemp dHi    = newTemp(Ity_I64);
   12239       IRTemp dLo    = newTemp(Ity_I64);
   12240 
   12241       modrm = insn[3];
   12242 
   12243       switch (insn[2]) {
   12244          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12245          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12246          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12247          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12248          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12249          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12250          default: vassert(0);
   12251       }
   12252       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12253          opCatO = Iop_InterleaveHI32x2;
   12254          opCatE = Iop_InterleaveLO32x2;
   12255       }
   12256 
   12257       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12258 
   12259       if (epartIsReg(modrm)) {
   12260          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12261          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12262                                   nameXMMReg(gregOfRM(modrm)));
   12263          delta += 3+1;
   12264       } else {
   12265          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12266          gen_SEGV_if_not_16_aligned( addr );
   12267          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12268          DIP("ph%s %s,%s\n", str, dis_buf,
   12269                              nameXMMReg(gregOfRM(modrm)));
   12270          delta += 3+alen;
   12271       }
   12272 
   12273       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12274       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12275       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12276       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12277 
   12278       /* This isn't a particularly efficient way to compute the
   12279          result, but at least it avoids a proliferation of IROps,
   12280          hence avoids complicating all the backends. */
   12281       putXMMReg(
   12282          gregOfRM(modrm),
   12283          binop(Iop_64HLtoV128,
   12284                binop(opV64,
   12285                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12286                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12287                ),
   12288                binop(opV64,
   12289                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12290                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12291                )
   12292          )
   12293       );
   12294       goto decode_success;
   12295    }
   12296 
   12297    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12298       (MMX) */
   12299    if (sz == 4
   12300        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12301       IRTemp sV = newTemp(Ity_I64);
   12302       IRTemp dV = newTemp(Ity_I64);
   12303 
   12304       modrm = insn[3];
   12305       do_MMX_preamble();
   12306       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12307 
   12308       if (epartIsReg(modrm)) {
   12309          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12310          delta += 3+1;
   12311          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12312                                  nameMMXReg(gregOfRM(modrm)));
   12313       } else {
   12314          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12315          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12316          delta += 3+alen;
   12317          DIP("pmulhrsw %s,%s\n", dis_buf,
   12318                                  nameMMXReg(gregOfRM(modrm)));
   12319       }
   12320 
   12321       putMMXReg(
   12322          gregOfRM(modrm),
   12323          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12324       );
   12325       goto decode_success;
   12326    }
   12327 
   12328    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12329       Scale (XMM) */
   12330    if (sz == 2
   12331        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12332       IRTemp sV  = newTemp(Ity_V128);
   12333       IRTemp dV  = newTemp(Ity_V128);
   12334       IRTemp sHi = newTemp(Ity_I64);
   12335       IRTemp sLo = newTemp(Ity_I64);
   12336       IRTemp dHi = newTemp(Ity_I64);
   12337       IRTemp dLo = newTemp(Ity_I64);
   12338 
   12339       modrm = insn[3];
   12340       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12341 
   12342       if (epartIsReg(modrm)) {
   12343          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12344          delta += 3+1;
   12345          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12346                                  nameXMMReg(gregOfRM(modrm)));
   12347       } else {
   12348          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12349          gen_SEGV_if_not_16_aligned( addr );
   12350          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12351          delta += 3+alen;
   12352          DIP("pmulhrsw %s,%s\n", dis_buf,
   12353                                  nameXMMReg(gregOfRM(modrm)));
   12354       }
   12355 
   12356       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12357       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12358       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12359       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12360 
   12361       putXMMReg(
   12362          gregOfRM(modrm),
   12363          binop(Iop_64HLtoV128,
   12364                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12365                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12366          )
   12367       );
   12368       goto decode_success;
   12369    }
   12370 
   12371    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12372    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12373    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12374    if (sz == 4
   12375        && insn[0] == 0x0F && insn[1] == 0x38
   12376        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12377       IRTemp sV      = newTemp(Ity_I64);
   12378       IRTemp dV      = newTemp(Ity_I64);
   12379       const HChar* str = "???";
   12380       Int    laneszB = 0;
   12381 
   12382       switch (insn[2]) {
   12383          case 0x08: laneszB = 1; str = "b"; break;
   12384          case 0x09: laneszB = 2; str = "w"; break;
   12385          case 0x0A: laneszB = 4; str = "d"; break;
   12386          default: vassert(0);
   12387       }
   12388 
   12389       modrm = insn[3];
   12390       do_MMX_preamble();
   12391       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12392 
   12393       if (epartIsReg(modrm)) {
   12394          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12395          delta += 3+1;
   12396          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12397                                      nameMMXReg(gregOfRM(modrm)));
   12398       } else {
   12399          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12400          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12401          delta += 3+alen;
   12402          DIP("psign%s %s,%s\n", str, dis_buf,
   12403                                      nameMMXReg(gregOfRM(modrm)));
   12404       }
   12405 
   12406       putMMXReg(
   12407          gregOfRM(modrm),
   12408          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12409       );
   12410       goto decode_success;
   12411    }
   12412 
   12413    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12414    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12415    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12416    if (sz == 2
   12417        && insn[0] == 0x0F && insn[1] == 0x38
   12418        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12419       IRTemp sV      = newTemp(Ity_V128);
   12420       IRTemp dV      = newTemp(Ity_V128);
   12421       IRTemp sHi     = newTemp(Ity_I64);
   12422       IRTemp sLo     = newTemp(Ity_I64);
   12423       IRTemp dHi     = newTemp(Ity_I64);
   12424       IRTemp dLo     = newTemp(Ity_I64);
   12425       const HChar* str = "???";
   12426       Int    laneszB = 0;
   12427 
   12428       switch (insn[2]) {
   12429          case 0x08: laneszB = 1; str = "b"; break;
   12430          case 0x09: laneszB = 2; str = "w"; break;
   12431          case 0x0A: laneszB = 4; str = "d"; break;
   12432          default: vassert(0);
   12433       }
   12434 
   12435       modrm = insn[3];
   12436       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12437 
   12438       if (epartIsReg(modrm)) {
   12439          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12440          delta += 3+1;
   12441          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12442                                      nameXMMReg(gregOfRM(modrm)));
   12443       } else {
   12444          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12445          gen_SEGV_if_not_16_aligned( addr );
   12446          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12447          delta += 3+alen;
   12448          DIP("psign%s %s,%s\n", str, dis_buf,
   12449                                      nameXMMReg(gregOfRM(modrm)));
   12450       }
   12451 
   12452       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12453       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12454       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12455       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12456 
   12457       putXMMReg(
   12458          gregOfRM(modrm),
   12459          binop(Iop_64HLtoV128,
   12460                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12461                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12462          )
   12463       );
   12464       goto decode_success;
   12465    }
   12466 
   12467    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12468    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12469    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12470    if (sz == 4
   12471        && insn[0] == 0x0F && insn[1] == 0x38
   12472        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12473       IRTemp sV      = newTemp(Ity_I64);
   12474       const HChar* str = "???";
   12475       Int    laneszB = 0;
   12476 
   12477       switch (insn[2]) {
   12478          case 0x1C: laneszB = 1; str = "b"; break;
   12479          case 0x1D: laneszB = 2; str = "w"; break;
   12480          case 0x1E: laneszB = 4; str = "d"; break;
   12481          default: vassert(0);
   12482       }
   12483 
   12484       modrm = insn[3];
   12485       do_MMX_preamble();
   12486 
   12487       if (epartIsReg(modrm)) {
   12488          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12489          delta += 3+1;
   12490          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12491                                     nameMMXReg(gregOfRM(modrm)));
   12492       } else {
   12493          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12494          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12495          delta += 3+alen;
   12496          DIP("pabs%s %s,%s\n", str, dis_buf,
   12497                                     nameMMXReg(gregOfRM(modrm)));
   12498       }
   12499 
   12500       putMMXReg(
   12501          gregOfRM(modrm),
   12502          dis_PABS_helper( mkexpr(sV), laneszB )
   12503       );
   12504       goto decode_success;
   12505    }
   12506 
   12507    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12508    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12509    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12510    if (sz == 2
   12511        && insn[0] == 0x0F && insn[1] == 0x38
   12512        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12513       IRTemp sV      = newTemp(Ity_V128);
   12514       IRTemp sHi     = newTemp(Ity_I64);
   12515       IRTemp sLo     = newTemp(Ity_I64);
   12516       const HChar* str = "???";
   12517       Int    laneszB = 0;
   12518 
   12519       switch (insn[2]) {
   12520          case 0x1C: laneszB = 1; str = "b"; break;
   12521          case 0x1D: laneszB = 2; str = "w"; break;
   12522          case 0x1E: laneszB = 4; str = "d"; break;
   12523          default: vassert(0);
   12524       }
   12525 
   12526       modrm = insn[3];
   12527 
   12528       if (epartIsReg(modrm)) {
   12529          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12530          delta += 3+1;
   12531          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12532                                     nameXMMReg(gregOfRM(modrm)));
   12533       } else {
   12534          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12535          gen_SEGV_if_not_16_aligned( addr );
   12536          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12537          delta += 3+alen;
   12538          DIP("pabs%s %s,%s\n", str, dis_buf,
   12539                                     nameXMMReg(gregOfRM(modrm)));
   12540       }
   12541 
   12542       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12543       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12544 
   12545       putXMMReg(
   12546          gregOfRM(modrm),
   12547          binop(Iop_64HLtoV128,
   12548                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12549                dis_PABS_helper( mkexpr(sLo), laneszB )
   12550          )
   12551       );
   12552       goto decode_success;
   12553    }
   12554 
   12555    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12556    if (sz == 4
   12557        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12558       IRTemp sV  = newTemp(Ity_I64);
   12559       IRTemp dV  = newTemp(Ity_I64);
   12560       IRTemp res = newTemp(Ity_I64);
   12561 
   12562       modrm = insn[3];
   12563       do_MMX_preamble();
   12564       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12565 
   12566       if (epartIsReg(modrm)) {
   12567          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12568          d32 = (UInt)insn[3+1];
   12569          delta += 3+1+1;
   12570          DIP("palignr $%d,%s,%s\n",  (Int)d32,
   12571                                      nameMMXReg(eregOfRM(modrm)),
   12572                                      nameMMXReg(gregOfRM(modrm)));
   12573       } else {
   12574          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12575          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12576          d32 = (UInt)insn[3+alen];
   12577          delta += 3+alen+1;
   12578          DIP("palignr $%d%s,%s\n", (Int)d32,
   12579                                    dis_buf,
   12580                                    nameMMXReg(gregOfRM(modrm)));
   12581       }
   12582 
   12583       if (d32 == 0) {
   12584          assign( res, mkexpr(sV) );
   12585       }
   12586       else if (d32 >= 1 && d32 <= 7) {
   12587          assign(res,
   12588                 binop(Iop_Or64,
   12589                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12590                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12591                      )));
   12592       }
   12593       else if (d32 == 8) {
   12594         assign( res, mkexpr(dV) );
   12595       }
   12596       else if (d32 >= 9 && d32 <= 15) {
   12597          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12598       }
   12599       else if (d32 >= 16 && d32 <= 255) {
   12600          assign( res, mkU64(0) );
   12601       }
   12602       else
   12603          vassert(0);
   12604 
   12605       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12606       goto decode_success;
   12607    }
   12608 
   12609    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12610    if (sz == 2
   12611        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12612       IRTemp sV  = newTemp(Ity_V128);
   12613       IRTemp dV  = newTemp(Ity_V128);
   12614       IRTemp sHi = newTemp(Ity_I64);
   12615       IRTemp sLo = newTemp(Ity_I64);
   12616       IRTemp dHi = newTemp(Ity_I64);
   12617       IRTemp dLo = newTemp(Ity_I64);
   12618       IRTemp rHi = newTemp(Ity_I64);
   12619       IRTemp rLo = newTemp(Ity_I64);
   12620 
   12621       modrm = insn[3];
   12622       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12623 
   12624       if (epartIsReg(modrm)) {
   12625          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12626          d32 = (UInt)insn[3+1];
   12627          delta += 3+1+1;
   12628          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12629                                     nameXMMReg(eregOfRM(modrm)),
   12630                                     nameXMMReg(gregOfRM(modrm)));
   12631       } else {
   12632          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12633          gen_SEGV_if_not_16_aligned( addr );
   12634          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12635          d32 = (UInt)insn[3+alen];
   12636          delta += 3+alen+1;
   12637          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12638                                     dis_buf,
   12639                                     nameXMMReg(gregOfRM(modrm)));
   12640       }
   12641 
   12642       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12643       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12644       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12645       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12646 
   12647       if (d32 == 0) {
   12648          assign( rHi, mkexpr(sHi) );
   12649          assign( rLo, mkexpr(sLo) );
   12650       }
   12651       else if (d32 >= 1 && d32 <= 7) {
   12652          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12653          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12654       }
   12655       else if (d32 == 8) {
   12656          assign( rHi, mkexpr(dLo) );
   12657          assign( rLo, mkexpr(sHi) );
   12658       }
   12659       else if (d32 >= 9 && d32 <= 15) {
   12660          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12661          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12662       }
   12663       else if (d32 == 16) {
   12664          assign( rHi, mkexpr(dHi) );
   12665          assign( rLo, mkexpr(dLo) );
   12666       }
   12667       else if (d32 >= 17 && d32 <= 23) {
   12668          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12669          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12670       }
   12671       else if (d32 == 24) {
   12672          assign( rHi, mkU64(0) );
   12673          assign( rLo, mkexpr(dHi) );
   12674       }
   12675       else if (d32 >= 25 && d32 <= 31) {
   12676          assign( rHi, mkU64(0) );
   12677          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12678       }
   12679       else if (d32 >= 32 && d32 <= 255) {
   12680          assign( rHi, mkU64(0) );
   12681          assign( rLo, mkU64(0) );
   12682       }
   12683       else
   12684          vassert(0);
   12685 
   12686       putXMMReg(
   12687          gregOfRM(modrm),
   12688          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12689       );
   12690       goto decode_success;
   12691    }
   12692 
   12693    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12694    if (sz == 4
   12695        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12696       IRTemp sV      = newTemp(Ity_I64);
   12697       IRTemp dV      = newTemp(Ity_I64);
   12698 
   12699       modrm = insn[3];
   12700       do_MMX_preamble();
   12701       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12702 
   12703       if (epartIsReg(modrm)) {
   12704          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12705          delta += 3+1;
   12706          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12707                                nameMMXReg(gregOfRM(modrm)));
   12708       } else {
   12709          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12710          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12711          delta += 3+alen;
   12712          DIP("pshufb %s,%s\n", dis_buf,
   12713                                nameMMXReg(gregOfRM(modrm)));
   12714       }
   12715 
   12716       putMMXReg(
   12717          gregOfRM(modrm),
   12718          binop(
   12719             Iop_And64,
   12720             /* permute the lanes */
   12721             binop(
   12722                Iop_Perm8x8,
   12723                mkexpr(dV),
   12724                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12725             ),
   12726             /* mask off lanes which have (index & 0x80) == 0x80 */
   12727             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12728          )
   12729       );
   12730       goto decode_success;
   12731    }
   12732 
   12733    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12734    if (sz == 2
   12735        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12736       IRTemp sV         = newTemp(Ity_V128);
   12737       IRTemp dV         = newTemp(Ity_V128);
   12738       IRTemp sHi        = newTemp(Ity_I64);
   12739       IRTemp sLo        = newTemp(Ity_I64);
   12740       IRTemp dHi        = newTemp(Ity_I64);
   12741       IRTemp dLo        = newTemp(Ity_I64);
   12742       IRTemp rHi        = newTemp(Ity_I64);
   12743       IRTemp rLo        = newTemp(Ity_I64);
   12744       IRTemp sevens     = newTemp(Ity_I64);
   12745       IRTemp mask0x80hi = newTemp(Ity_I64);
   12746       IRTemp mask0x80lo = newTemp(Ity_I64);
   12747       IRTemp maskBit3hi = newTemp(Ity_I64);
   12748       IRTemp maskBit3lo = newTemp(Ity_I64);
   12749       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12750       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12751       IRTemp permdHi    = newTemp(Ity_I64);
   12752       IRTemp permdLo    = newTemp(Ity_I64);
   12753 
   12754       modrm = insn[3];
   12755       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12756 
   12757       if (epartIsReg(modrm)) {
   12758          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12759          delta += 3+1;
   12760          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12761                                nameXMMReg(gregOfRM(modrm)));
   12762       } else {
   12763          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12764          gen_SEGV_if_not_16_aligned( addr );
   12765          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12766          delta += 3+alen;
   12767          DIP("pshufb %s,%s\n", dis_buf,
   12768                                nameXMMReg(gregOfRM(modrm)));
   12769       }
   12770 
   12771       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12772       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12773       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12774       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12775 
   12776       assign( sevens, mkU64(0x0707070707070707ULL) );
   12777 
   12778       /*
   12779       mask0x80hi = Not(SarN8x8(sHi,7))
   12780       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12781       sAnd7hi    = And(sHi,sevens)
   12782       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12783                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12784       rHi        = And(permdHi,mask0x80hi)
   12785       */
   12786       assign(
   12787          mask0x80hi,
   12788          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12789 
   12790       assign(
   12791          maskBit3hi,
   12792          binop(Iop_SarN8x8,
   12793                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12794                mkU8(7)));
   12795 
   12796       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12797 
   12798       assign(
   12799          permdHi,
   12800          binop(
   12801             Iop_Or64,
   12802             binop(Iop_And64,
   12803                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12804                   mkexpr(maskBit3hi)),
   12805             binop(Iop_And64,
   12806                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12807                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12808 
   12809       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12810 
   12811       /* And the same for the lower half of the result.  What fun. */
   12812 
   12813       assign(
   12814          mask0x80lo,
   12815          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12816 
   12817       assign(
   12818          maskBit3lo,
   12819          binop(Iop_SarN8x8,
   12820                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12821                mkU8(7)));
   12822 
   12823       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12824 
   12825       assign(
   12826          permdLo,
   12827          binop(
   12828             Iop_Or64,
   12829             binop(Iop_And64,
   12830                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12831                   mkexpr(maskBit3lo)),
   12832             binop(Iop_And64,
   12833                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12834                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12835 
   12836       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12837 
   12838       putXMMReg(
   12839          gregOfRM(modrm),
   12840          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12841       );
   12842       goto decode_success;
   12843    }
   12844 
   12845    /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
   12846    /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
   12847    if ((sz == 2 || sz == 4)
   12848        && insn[0] == 0x0F && insn[1] == 0x38
   12849        && (insn[2] == 0xF0 || insn[2] == 0xF1)
   12850        && !epartIsReg(insn[3])) {
   12851 
   12852       modrm = insn[3];
   12853       addr = disAMode(&alen, sorb, delta + 3, dis_buf);
   12854       delta += 3 + alen;
   12855       ty = szToITy(sz);
   12856       IRTemp src = newTemp(ty);
   12857 
   12858       if (insn[2] == 0xF0) { /* LOAD */
   12859          assign(src, loadLE(ty, mkexpr(addr)));
   12860          IRTemp dst = math_BSWAP(src, ty);
   12861          putIReg(sz, gregOfRM(modrm), mkexpr(dst));
   12862          DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
   12863       } else { /* STORE */
   12864          assign(src, getIReg(sz, gregOfRM(modrm)));
   12865          IRTemp dst = math_BSWAP(src, ty);
   12866          storeLE(mkexpr(addr), mkexpr(dst));
   12867          DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
   12868       }
   12869       goto decode_success;
   12870    }
   12871 
   12872    /* ---------------------------------------------------- */
   12873    /* --- end of the SSSE3 decoder.                    --- */
   12874    /* ---------------------------------------------------- */
   12875 
   12876    /* ---------------------------------------------------- */
   12877    /* --- start of the SSE4 decoder                    --- */
   12878    /* ---------------------------------------------------- */
   12879 
   12880    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12881       (Partial implementation only -- only deal with cases where
   12882       the rounding mode is specified directly by the immediate byte.)
   12883       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12884       (Limitations ditto.  NB: the 0x0B (ROUNDSD) opcode test below is
   12885       currently commented out, so only ROUNDSS is accepted here.) */
   12886    if (sz == 2
   12887        && insn[0] == 0x0F && insn[1] == 0x3A
   12888        && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
   12889 
   12890       Bool   isD = insn[2] == 0x0B; /* always False while 0x0B is disabled */
   12891       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12892       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12893       Int    imm = 0;
   12894 
   12895       modrm = insn[3];
   12896 
   12897       if (epartIsReg(modrm)) {
   12898          assign( src,
   12899                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12900                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12901          imm = insn[3+1]; /* imm8 follows the modrm byte */
   12902          if (imm & ~3) goto decode_failure; /* only low two bits may be set */
   12903          delta += 3+1+1;
   12904          DIP( "rounds%c $%d,%s,%s\n",
   12905               isD ? 'd' : 's',
   12906               imm, nameXMMReg( eregOfRM(modrm) ),
   12907                    nameXMMReg( gregOfRM(modrm) ) );
   12908       } else {
   12909          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12910          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12911          imm = insn[3+alen]; /* imm8 follows the alen amode bytes */
   12912          if (imm & ~3) goto decode_failure; /* only low two bits may be set */
   12913          delta += 3+alen+1;
   12914          DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
   12915               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12916       }
   12917 
   12918       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12919          that encoding is the same as the encoding for IRRoundingMode,
   12920          we can use that value directly in the IR as a rounding
   12921          mode. */
   12922       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12923                   mkU32(imm & 3), mkexpr(src)) );
   12924 
   12925       if (isD)
   12926          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12927       else
   12928          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12929 
   12930       goto decode_success;
   12931    }
   12932 
   12933    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12934       which we can only decode if we're sure this is an AMD cpu that
   12935       supports LZCNT, since otherwise it's BSR, which behaves
   12936       differently. */
   12937    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12938        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12939       vassert(sz == 2 || sz == 4);
   12940       /*IRType*/ ty  = szToITy(sz);
   12941       IRTemp     src = newTemp(ty);
   12942       modrm = insn[3];
   12943       if (epartIsReg(modrm)) {
   12944          assign(src, getIReg(sz, eregOfRM(modrm)));
   12945          delta += 3+1;
   12946          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   12947              nameIReg(sz, eregOfRM(modrm)),
   12948              nameIReg(sz, gregOfRM(modrm)));
   12949       } else {
   12950          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12951          assign(src, loadLE(ty, mkexpr(addr)));
   12952          delta += 3+alen;
   12953          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   12954              nameIReg(sz, gregOfRM(modrm)));
   12955       }
   12956 
   12957       IRTemp res = gen_LZCNT(ty, src);
   12958       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   12959 
   12960       // Update flags.  This is pretty lame .. perhaps can do better
   12961       // if this turns out to be performance critical.
   12962       // O S A P are cleared.  Z is set if RESULT == 0.
   12963       // C is set if SRC is zero.
   12964       IRTemp src32 = newTemp(Ity_I32);
   12965       IRTemp res32 = newTemp(Ity_I32);
   12966       assign(src32, widenUto32(mkexpr(src)));
   12967       assign(res32, widenUto32(mkexpr(res)));
   12968 
   12969       IRTemp oszacp = newTemp(Ity_I32);
   12970       assign(
   12971          oszacp,
   12972          binop(Iop_Or32,
   12973                binop(Iop_Shl32,
   12974                      unop(Iop_1Uto32,
   12975                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   12976                      mkU8(X86G_CC_SHIFT_Z)),
   12977                binop(Iop_Shl32,
   12978                      unop(Iop_1Uto32,
   12979                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   12980                      mkU8(X86G_CC_SHIFT_C))
   12981          )
   12982       );
   12983 
   12984       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12985       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12986       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12987       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   12988 
   12989       goto decode_success;
   12990    }
   12991 
   12992    /* ---------------------------------------------------- */
   12993    /* --- end of the SSE4 decoder                      --- */
   12994    /* ---------------------------------------------------- */
   12995 
   12996    after_sse_decoders:
   12997 
   12998    /* ---------------------------------------------------- */
   12999    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   13000    /* ---------------------------------------------------- */
   13001 
   13002    /* 67 E3 = JCXZ (for JECXZ see below) */
   13003    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   13004       delta += 2;
   13005       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13006       delta ++;
   13007       stmt( IRStmt_Exit(
   13008                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   13009                Ijk_Boring,
   13010                IRConst_U32(d32),
   13011                OFFB_EIP
   13012             ));
   13013        DIP("jcxz 0x%x\n", d32);
   13014        goto decode_success;
   13015    }
   13016 
   13017    /* ---------------------------------------------------- */
   13018    /* --- start of the baseline insn decoder            -- */
   13019    /* ---------------------------------------------------- */
   13020 
   13021    /* Get the primary opcode. */
   13022    opc = getIByte(delta); delta++;
   13023 
   13024    /* We get here if the current insn isn't SSE, or this CPU doesn't
   13025       support SSE. */
   13026 
   13027    switch (opc) {
   13028 
   13029    /* ------------------------ Control flow --------------- */
   13030 
   13031    case 0xC2: /* RET imm16 */
   13032       d32 = getUDisp16(delta);
   13033       delta += 2;
   13034       dis_ret(&dres, d32);
   13035       DIP("ret %d\n", (Int)d32);
   13036       break;
   13037    case 0xC3: /* RET */
   13038       dis_ret(&dres, 0);
   13039       DIP("ret\n");
   13040       break;
   13041 
   13042    case 0xCF: /* IRET */
   13043       /* Note, this is an extremely kludgey and limited implementation
   13044          of iret.  All it really does is:
   13045             popl %EIP; popl %CS; popl %EFLAGS.
   13046          %CS is set but ignored (as it is in (eg) popw %cs). */
   13047       t1 = newTemp(Ity_I32); /* ESP */
   13048       t2 = newTemp(Ity_I32); /* new EIP */
   13049       t3 = newTemp(Ity_I32); /* new CS */
   13050       t4 = newTemp(Ity_I32); /* new EFLAGS */
   13051       assign(t1, getIReg(4,R_ESP));
   13052       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   13053       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   13054       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   13055       /* Get stuff off stack */
   13056       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   13057       /* set %CS (which is ignored anyway) */
   13058       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   13059       /* set %EFLAGS */
   13060       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   13061       /* goto new EIP value */
   13062       jmp_treg(&dres, Ijk_Ret, t2);
   13063       vassert(dres.whatNext == Dis_StopHere);
   13064       DIP("iret (very kludgey)\n");
   13065       break;
   13066 
   13067    case 0xE8: /* CALL J4 */
   13068       d32 = getUDisp32(delta); delta += 4;
   13069       d32 += (guest_EIP_bbstart+delta);
   13070       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   13071       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   13072                                          && getIByte(delta) <= 0x5F) {
   13073          /* Specially treat the position-independent-code idiom
   13074                  call X
   13075               X: popl %reg
   13076             as
   13077                  movl %eip, %reg.
   13078             since this generates better code, but for no other reason. */
   13079          Int archReg = getIByte(delta) - 0x58;
   13080          /* vex_printf("-- fPIC thingy\n"); */
   13081          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   13082          delta++; /* Step over the POP */
   13083          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   13084       } else {
   13085          /* The normal sequence for a call. */
   13086          t1 = newTemp(Ity_I32);
   13087          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   13088          putIReg(4, R_ESP, mkexpr(t1));
   13089          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   13090          if (resteerOkFn( callback_opaque, (Addr32)d32 )) {
   13091             /* follow into the call target. */
   13092             dres.whatNext   = Dis_ResteerU;
   13093             dres.continueAt = (Addr32)d32;
   13094          } else {
   13095             jmp_lit(&dres, Ijk_Call, d32);
   13096             vassert(dres.whatNext == Dis_StopHere);
   13097          }
   13098          DIP("call 0x%x\n",d32);
   13099       }
   13100       break;
   13101 
   13102 //--    case 0xC8: /* ENTER */
   13103 //--       d32 = getUDisp16(eip); eip += 2;
   13104 //--       abyte = getIByte(delta); delta++;
   13105 //--
   13106 //--       vg_assert(sz == 4);
   13107 //--       vg_assert(abyte == 0);
   13108 //--
   13109 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   13110 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   13111 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   13112 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13113 //--       uLiteral(cb, sz);
   13114 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13115 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   13116 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   13117 //--       if (d32) {
   13118 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   13119 //--          uLiteral(cb, d32);
   13120 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   13121 //--       }
   13122 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   13123 //--       break;
   13124 
   13125    case 0xC9: /* LEAVE */
   13126       vassert(sz == 4);
   13127       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13128       assign(t1, getIReg(4,R_EBP));
   13129       /* First PUT ESP looks redundant, but need it because ESP must
   13130          always be up-to-date for Memcheck to work... */
   13131       putIReg(4, R_ESP, mkexpr(t1));
   13132       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   13133       putIReg(4, R_EBP, mkexpr(t2));
   13134       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   13135       DIP("leave\n");
   13136       break;
   13137 
   13138    /* ---------------- Misc weird-ass insns --------------- */
   13139 
   13140    case 0x27: /* DAA */
   13141    case 0x2F: /* DAS */
   13142    case 0x37: /* AAA */
   13143    case 0x3F: /* AAS */
   13144       /* An ugly implementation for some ugly instructions.  Oh
   13145 	 well. */
   13146       if (sz != 4) goto decode_failure;
   13147       t1 = newTemp(Ity_I32);
   13148       t2 = newTemp(Ity_I32);
   13149       /* Make up a 32-bit value (t1), with the old value of AX in the
   13150          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13151          bits. */
   13152       assign(t1,
   13153              binop(Iop_16HLto32,
   13154                    unop(Iop_32to16,
   13155                         mk_x86g_calculate_eflags_all()),
   13156                    getIReg(2, R_EAX)
   13157             ));
   13158       /* Call the helper fn, to get a new AX and OSZACP value, and
   13159          poke both back into the guest state.  Also pass the helper
   13160          the actual opcode so it knows which of the 4 instructions it
   13161          is doing the computation for. */
   13162       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   13163       assign(t2,
   13164               mkIRExprCCall(
   13165                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   13166                  &x86g_calculate_daa_das_aaa_aas,
   13167                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13168             ));
   13169      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13170 
   13171      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13172      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13173      stmt( IRStmt_Put( OFFB_CC_DEP1,
   13174                        binop(Iop_And32,
   13175                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13176                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13177                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13178                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13179                             )
   13180                       )
   13181          );
   13182      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   13183         elimination of previous stores to this field work better. */
   13184      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13185      switch (opc) {
   13186         case 0x27: DIP("daa\n"); break;
   13187         case 0x2F: DIP("das\n"); break;
   13188         case 0x37: DIP("aaa\n"); break;
   13189         case 0x3F: DIP("aas\n"); break;
   13190         default: vassert(0);
   13191      }
   13192      break;
   13193 
   13194    case 0xD4: /* AAM */
   13195    case 0xD5: /* AAD */
   13196       d32 = getIByte(delta); delta++;
   13197       if (sz != 4 || d32 != 10) goto decode_failure;
   13198       t1 = newTemp(Ity_I32);
   13199       t2 = newTemp(Ity_I32);
   13200       /* Make up a 32-bit value (t1), with the old value of AX in the
   13201          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13202          bits. */
   13203       assign(t1,
   13204              binop(Iop_16HLto32,
   13205                    unop(Iop_32to16,
   13206                         mk_x86g_calculate_eflags_all()),
   13207                    getIReg(2, R_EAX)
   13208             ));
   13209       /* Call the helper fn, to get a new AX and OSZACP value, and
   13210          poke both back into the guest state.  Also pass the helper
   13211          the actual opcode so it knows which of the 2 instructions it
   13212          is doing the computation for. */
   13213       assign(t2,
   13214               mkIRExprCCall(
   13215                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   13216                  &x86g_calculate_aad_aam,
   13217                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13218             ));
   13219       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13220 
   13221       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13222       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13223       stmt( IRStmt_Put( OFFB_CC_DEP1,
   13224                         binop(Iop_And32,
   13225                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13226                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13227                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13228                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13229                              )
   13230                        )
   13231           );
   13232       /* Set NDEP even though it isn't used.  This makes
   13233          redundant-PUT elimination of previous stores to this field
   13234          work better. */
   13235       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13236 
   13237       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   13238       break;
   13239 
   13240    /* ------------------------ CWD/CDQ -------------------- */
   13241 
   13242    case 0x98: /* CBW */
   13243       if (sz == 4) {
   13244          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   13245          DIP("cwde\n");
   13246       } else {
   13247          vassert(sz == 2);
   13248          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   13249          DIP("cbw\n");
   13250       }
   13251       break;
   13252 
   13253    case 0x99: /* CWD/CDQ */
   13254       ty = szToITy(sz);
   13255       putIReg(sz, R_EDX,
   13256                   binop(mkSizedOp(ty,Iop_Sar8),
   13257                         getIReg(sz, R_EAX),
   13258                         mkU8(sz == 2 ? 15 : 31)) );
   13259       DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
   13260       break;
   13261 
   13262    /* ------------------------ FPU ops -------------------- */
   13263 
   13264    case 0x9E: /* SAHF */
   13265       codegen_SAHF();
   13266       DIP("sahf\n");
   13267       break;
   13268 
   13269    case 0x9F: /* LAHF */
   13270       codegen_LAHF();
   13271       DIP("lahf\n");
   13272       break;
   13273 
   13274    case 0x9B: /* FWAIT */
   13275       /* ignore? */
   13276       DIP("fwait\n");
   13277       break;
   13278 
   13279    case 0xD8:
   13280    case 0xD9:
   13281    case 0xDA:
   13282    case 0xDB:
   13283    case 0xDC:
   13284    case 0xDD:
   13285    case 0xDE:
   13286    case 0xDF: {
   13287       Int  delta0    = delta;
   13288       Bool decode_OK = False;
   13289       delta = dis_FPU ( &decode_OK, sorb, delta );
   13290       if (!decode_OK) {
   13291          delta = delta0;
   13292          goto decode_failure;
   13293       }
   13294       break;
   13295    }
   13296 
   13297    /* ------------------------ INC & DEC ------------------ */
   13298 
   13299    case 0x40: /* INC eAX */
   13300    case 0x41: /* INC eCX */
   13301    case 0x42: /* INC eDX */
   13302    case 0x43: /* INC eBX */
   13303    case 0x44: /* INC eSP */
   13304    case 0x45: /* INC eBP */
   13305    case 0x46: /* INC eSI */
   13306    case 0x47: /* INC eDI */
   13307       vassert(sz == 2 || sz == 4);
   13308       ty = szToITy(sz);
   13309       t1 = newTemp(ty);
   13310       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13311                         getIReg(sz, (UInt)(opc - 0x40)),
   13312                         mkU(ty,1)) );
   13313       setFlags_INC_DEC( True, t1, ty );
   13314       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13315       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13316       break;
   13317 
   13318    case 0x48: /* DEC eAX */
   13319    case 0x49: /* DEC eCX */
   13320    case 0x4A: /* DEC eDX */
   13321    case 0x4B: /* DEC eBX */
   13322    case 0x4C: /* DEC eSP */
   13323    case 0x4D: /* DEC eBP */
   13324    case 0x4E: /* DEC eSI */
   13325    case 0x4F: /* DEC eDI */
   13326       vassert(sz == 2 || sz == 4);
   13327       ty = szToITy(sz);
   13328       t1 = newTemp(ty);
   13329       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13330                         getIReg(sz, (UInt)(opc - 0x48)),
   13331                         mkU(ty,1)) );
   13332       setFlags_INC_DEC( False, t1, ty );
   13333       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13334       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13335       break;
   13336 
   13337    /* ------------------------ INT ------------------------ */
   13338 
   13339    case 0xCC: /* INT 3 */
   13340       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
   13341       vassert(dres.whatNext == Dis_StopHere);
   13342       DIP("int $0x3\n");
   13343       break;
   13344 
   13345    case 0xCD: /* INT imm8 */
   13346       d32 = getIByte(delta); delta++;
   13347 
   13348       /* For any of the cases where we emit a jump (that is, for all
   13349          currently handled cases), it's important that all ArchRegs
   13350          carry their up-to-date value at this point.  So we declare an
   13351          end-of-block here, which forces any TempRegs caching ArchRegs
   13352          to be flushed. */
   13353 
   13354       /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
   13355          restart of this instruction (hence the "-2" two lines below,
   13356          to get the restart EIP to be this instruction).  This is
   13357          probably Linux-specific and it would be more correct to only
   13358          do this if the VexAbiInfo says that is what we should do.
   13359          This used to handle just 0x40-0x43; Jikes RVM uses a larger
   13360          range (0x3F-0x49), and this allows some slack as well. */
   13361       if (d32 >= 0x3F && d32 <= 0x4F) {
   13362          jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
   13363          vassert(dres.whatNext == Dis_StopHere);
   13364          DIP("int $0x%x\n", (Int)d32);
   13365          break;
   13366       }
   13367 
   13368       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13369          (darwin syscalls).  As part of this, note where we are, so we
   13370          can back up the guest to this point if the syscall needs to
   13371          be restarted. */
   13372       if (d32 == 0x80) {
   13373          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13374                            mkU32(guest_EIP_curr_instr) ) );
   13375          jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
   13376          vassert(dres.whatNext == Dis_StopHere);
   13377          DIP("int $0x80\n");
   13378          break;
   13379       }
   13380       if (d32 == 0x81) {
   13381          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13382                            mkU32(guest_EIP_curr_instr) ) );
   13383          jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
   13384          vassert(dres.whatNext == Dis_StopHere);
   13385          DIP("int $0x81\n");
   13386          break;
   13387       }
   13388       if (d32 == 0x82) {
   13389          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13390                            mkU32(guest_EIP_curr_instr) ) );
   13391          jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
   13392          vassert(dres.whatNext == Dis_StopHere);
   13393          DIP("int $0x82\n");
   13394          break;
   13395       }
   13396 
   13397       /* none of the above */
   13398       goto decode_failure;
   13399 
   13400    /* ------------------------ Jcond, byte offset --------- */
   13401 
   13402    case 0xEB: /* Jb (jump, byte offset) */
   13403       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13404       delta++;
   13405       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13406          dres.whatNext   = Dis_ResteerU;
   13407          dres.continueAt = (Addr32)d32;
   13408       } else {
   13409          jmp_lit(&dres, Ijk_Boring, d32);
   13410          vassert(dres.whatNext == Dis_StopHere);
   13411       }
   13412       DIP("jmp-8 0x%x\n", d32);
   13413       break;
   13414 
   13415    case 0xE9: /* Jv (jump, 16/32 offset) */
   13416       vassert(sz == 4); /* JRS added 2004 July 11 */
   13417       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13418       delta += sz;
   13419       if (resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13420          dres.whatNext   = Dis_ResteerU;
   13421          dres.continueAt = (Addr32)d32;
   13422       } else {
   13423          jmp_lit(&dres, Ijk_Boring, d32);
   13424          vassert(dres.whatNext == Dis_StopHere);
   13425       }
   13426       DIP("jmp 0x%x\n", d32);
   13427       break;
   13428 
   13429    case 0x70:
   13430    case 0x71:
   13431    case 0x72: /* JBb/JNAEb (jump below) */
   13432    case 0x73: /* JNBb/JAEb (jump not below) */
   13433    case 0x74: /* JZb/JEb (jump zero) */
   13434    case 0x75: /* JNZb/JNEb (jump not zero) */
   13435    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13436    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13437    case 0x78: /* JSb (jump negative) */
   13438    case 0x79: /* JNSb (jump not negative) */
   13439    case 0x7A: /* JP (jump parity even) */
   13440    case 0x7B: /* JNP/JPO (jump parity odd) */
   13441    case 0x7C: /* JLb/JNGEb (jump less) */
   13442    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13443    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13444    case 0x7F: /* JGb/JNLEb (jump greater) */
   13445     { Int    jmpDelta; /* signed 8-bit branch displacement */
   13446       const HChar* comment  = "";
   13447       jmpDelta = (Int)getSDisp8(delta);
   13448       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13449       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta; /* dest */
   13450       delta++;
   13451       if (resteerCisOk
   13452           && vex_control.guest_chase_cond
   13453           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13454           && jmpDelta < 0
   13455           && resteerOkFn( callback_opaque, (Addr32)d32) ) {
   13456          /* Speculation: assume this backward branch is taken.  So we
   13457             need to emit a side-exit to the insn following this one,
   13458             on the negation of the condition, and continue at the
   13459             branch target address (d32).  If we wind up back at the
   13460             first instruction of the trace, just stop; it's better to
   13461             let the IR loop unroller handle that case. */
   13462          stmt( IRStmt_Exit(
   13463                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13464                   Ijk_Boring,
   13465                   IRConst_U32(guest_EIP_bbstart+delta),
   13466                   OFFB_EIP ) );
   13467          dres.whatNext   = Dis_ResteerC;
   13468          dres.continueAt = (Addr32)d32;
   13469          comment = "(assumed taken)";
   13470       }
   13471       else
   13472       if (resteerCisOk
   13473           && vex_control.guest_chase_cond
   13474           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13475           && jmpDelta >= 0
   13476           && resteerOkFn( callback_opaque,
   13477                           (Addr32)(guest_EIP_bbstart+delta)) ) {
   13478          /* Speculation: assume this forward branch is not taken.  So
   13479             we need to emit a side-exit to d32 (the dest) and continue
   13480             disassembling at the insn immediately following this
   13481             one. */
   13482          stmt( IRStmt_Exit(
   13483                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13484                   Ijk_Boring,
   13485                   IRConst_U32(d32),
   13486                   OFFB_EIP ) );
   13487          dres.whatNext   = Dis_ResteerC;
   13488          dres.continueAt = guest_EIP_bbstart + delta;
   13489          comment = "(assumed not taken)";
   13490       }
   13491       else {
   13492          /* Conservative default translation - end the block at this
   13493             point. */
   13494          jcc_01( &dres, (X86Condcode)(opc - 0x70),
   13495                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13496          vassert(dres.whatNext == Dis_StopHere);
   13497       }
   13498       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13499       break;
   13500     }
   13501 
   13502    case 0xE3: /* JECXZ (for JCXZ see above) */
   13503       if (sz != 4) goto decode_failure;
   13504       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13505       delta ++;
   13506       stmt( IRStmt_Exit(
   13507                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13508             Ijk_Boring,
   13509             IRConst_U32(d32),
   13510             OFFB_EIP
   13511           ));
   13512       DIP("jecxz 0x%x\n", d32);
   13513       break;
   13514 
   13515    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13516    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13517    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13518     { /* Again, the docs say this uses ECX/CX as a count depending on
   13519          the address size override, not the operand one.  Since we
   13520          don't handle address size overrides, I guess that means
   13521          ECX. */
   13522       IRExpr* zbit  = NULL;
   13523       IRExpr* count = NULL;
   13524       IRExpr* cond  = NULL;
   13525       const HChar* xtra = NULL;
   13526 
   13527       if (sz != 4) goto decode_failure;
   13528       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13529       delta++;
   13530       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13531 
   13532       count = getIReg(4,R_ECX);
   13533       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13534       switch (opc) {
   13535          case 0xE2:
   13536             xtra = "";
   13537             break;
   13538          case 0xE1:
   13539             xtra = "e";
   13540             zbit = mk_x86g_calculate_condition( X86CondZ );
   13541 	    cond = mkAnd1(cond, zbit);
   13542             break;
   13543          case 0xE0:
   13544             xtra = "ne";
   13545             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13546 	    cond = mkAnd1(cond, zbit);
   13547             break;
   13548          default:
   13549 	    vassert(0);
   13550       }
   13551       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
   13552 
   13553       DIP("loop%s 0x%x\n", xtra, d32);
   13554       break;
   13555     }
   13556 
   13557    /* ------------------------ IMUL ----------------------- */
   13558 
   13559    case 0x69: /* IMUL Iv, Ev, Gv */
   13560       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13561       break;
   13562    case 0x6B: /* IMUL Ib, Ev, Gv */
   13563       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13564       break;
   13565 
   13566    /* ------------------------ MOV ------------------------ */
   13567 
   13568    case 0x88: /* MOV Gb,Eb */
   13569       delta = dis_mov_G_E(sorb, 1, delta);
   13570       break;
   13571 
   13572    case 0x89: /* MOV Gv,Ev */
   13573       delta = dis_mov_G_E(sorb, sz, delta);
   13574       break;
   13575 
   13576    case 0x8A: /* MOV Eb,Gb */
   13577       delta = dis_mov_E_G(sorb, 1, delta);
   13578       break;
   13579 
   13580    case 0x8B: /* MOV Ev,Gv */
   13581       delta = dis_mov_E_G(sorb, sz, delta);
   13582       break;
   13583 
   13584    case 0x8D: /* LEA M,Gv */
   13585       if (sz != 4)
   13586          goto decode_failure;
   13587       modrm = getIByte(delta);
   13588       if (epartIsReg(modrm))
   13589          goto decode_failure;
   13590       /* NOTE!  this is the one place where a segment override prefix
   13591          has no effect on the address calculation.  Therefore we pass
   13592          zero instead of sorb here. */
   13593       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13594       delta += alen;
   13595       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13596       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13597                             nameIReg(sz,gregOfRM(modrm)));
   13598       break;
   13599 
   13600    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13601       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13602       break;
   13603 
   13604    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13605       delta = dis_mov_Ew_Sw(sorb, delta);
   13606       break;
   13607 
   13608    case 0xA0: /* MOV Ob,AL */
   13609       sz = 1;
   13610       /* Fall through ... */
   13611    case 0xA1: /* MOV Ov,eAX */
   13612       d32 = getUDisp32(delta); delta += 4;
   13613       ty = szToITy(sz);
   13614       addr = newTemp(Ity_I32);
   13615       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13616       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13617       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13618                                 d32, nameIReg(sz,R_EAX));
   13619       break;
   13620 
   13621    case 0xA2: /* MOV AL,Ob */
   13622       sz = 1;
   13623       /* Fall through ... */
   13624    case 0xA3: /* MOV eAX,Ov */
   13625       d32 = getUDisp32(delta); delta += 4;
   13626       ty = szToITy(sz);
   13627       addr = newTemp(Ity_I32);
   13628       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13629       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13630       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13631                                 sorbTxt(sorb), d32);
   13632       break;
   13633 
   13634    case 0xB0: /* MOV imm,AL */
   13635    case 0xB1: /* MOV imm,CL */
   13636    case 0xB2: /* MOV imm,DL */
   13637    case 0xB3: /* MOV imm,BL */
   13638    case 0xB4: /* MOV imm,AH */
   13639    case 0xB5: /* MOV imm,CH */
   13640    case 0xB6: /* MOV imm,DH */
   13641    case 0xB7: /* MOV imm,BH */
   13642       d32 = getIByte(delta); delta += 1;
   13643       putIReg(1, opc-0xB0, mkU8(d32));
   13644       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13645       break;
   13646 
   13647    case 0xB8: /* MOV imm,eAX */
   13648    case 0xB9: /* MOV imm,eCX */
   13649    case 0xBA: /* MOV imm,eDX */
   13650    case 0xBB: /* MOV imm,eBX */
   13651    case 0xBC: /* MOV imm,eSP */
   13652    case 0xBD: /* MOV imm,eBP */
   13653    case 0xBE: /* MOV imm,eSI */
   13654    case 0xBF: /* MOV imm,eDI */
   13655       d32 = getUDisp(sz,delta); delta += sz;
   13656       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13657       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13658       break;
   13659 
   13660    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   13661       sz = 1;
   13662       goto maybe_do_Mov_I_E;
   13663    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   13664       goto maybe_do_Mov_I_E;
   13665 
   13666    maybe_do_Mov_I_E:
   13667       modrm = getIByte(delta);
   13668       if (gregOfRM(modrm) == 0) {
   13669          if (epartIsReg(modrm)) {
   13670             delta++; /* mod/rm byte */
   13671             d32 = getUDisp(sz,delta); delta += sz;
   13672             putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13673             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13674                                      nameIReg(sz,eregOfRM(modrm)));
   13675          } else {
   13676             addr = disAMode ( &alen, sorb, delta, dis_buf );
   13677             delta += alen;
   13678             d32 = getUDisp(sz,delta); delta += sz;
   13679             storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13680             DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13681          }
   13682          break;
   13683       }
   13684       goto decode_failure;
   13685 
   13686    /* ------------------------ opl imm, A ----------------- */
   13687 
   13688    case 0x04: /* ADD Ib, AL */
   13689       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13690       break;
   13691    case 0x05: /* ADD Iv, eAX */
   13692       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13693       break;
   13694 
   13695    case 0x0C: /* OR Ib, AL */
   13696       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13697       break;
   13698    case 0x0D: /* OR Iv, eAX */
   13699       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13700       break;
   13701 
   13702    case 0x14: /* ADC Ib, AL */
   13703       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13704       break;
   13705    case 0x15: /* ADC Iv, eAX */
   13706       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13707       break;
   13708 
   13709    case 0x1C: /* SBB Ib, AL */
   13710       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13711       break;
   13712    case 0x1D: /* SBB Iv, eAX */
   13713       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13714       break;
   13715 
   13716    case 0x24: /* AND Ib, AL */
   13717       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13718       break;
   13719    case 0x25: /* AND Iv, eAX */
   13720       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13721       break;
   13722 
   13723    case 0x2C: /* SUB Ib, AL */
   13724       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13725       break;
   13726    case 0x2D: /* SUB Iv, eAX */
   13727       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13728       break;
   13729 
   13730    case 0x34: /* XOR Ib, AL */
   13731       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13732       break;
   13733    case 0x35: /* XOR Iv, eAX */
   13734       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13735       break;
   13736 
   13737    case 0x3C: /* CMP Ib, AL */
   13738       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13739       break;
   13740    case 0x3D: /* CMP Iv, eAX */
   13741       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13742       break;
   13743 
   13744    case 0xA8: /* TEST Ib, AL */
   13745       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13746       break;
   13747    case 0xA9: /* TEST Iv, eAX */
   13748       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13749       break;
   13750 
   13751    /* ------------------------ opl Ev, Gv ----------------- */
   13752 
   13753    case 0x02: /* ADD Eb,Gb */
   13754       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13755       break;
   13756    case 0x03: /* ADD Ev,Gv */
   13757       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13758       break;
   13759 
   13760    case 0x0A: /* OR Eb,Gb */
   13761       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13762       break;
   13763    case 0x0B: /* OR Ev,Gv */
   13764       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13765       break;
   13766 
   13767    case 0x12: /* ADC Eb,Gb */
   13768       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13769       break;
   13770    case 0x13: /* ADC Ev,Gv */
   13771       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13772       break;
   13773 
   13774    case 0x1A: /* SBB Eb,Gb */
   13775       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13776       break;
   13777    case 0x1B: /* SBB Ev,Gv */
   13778       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13779       break;
   13780 
   13781    case 0x22: /* AND Eb,Gb */
   13782       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13783       break;
   13784    case 0x23: /* AND Ev,Gv */
   13785       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13786       break;
   13787 
   13788    case 0x2A: /* SUB Eb,Gb */
   13789       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13790       break;
   13791    case 0x2B: /* SUB Ev,Gv */
   13792       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13793       break;
   13794 
   13795    case 0x32: /* XOR Eb,Gb */
   13796       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13797       break;
   13798    case 0x33: /* XOR Ev,Gv */
   13799       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13800       break;
   13801 
   13802    case 0x3A: /* CMP Eb,Gb */
   13803       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13804       break;
   13805    case 0x3B: /* CMP Ev,Gv */
   13806       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13807       break;
   13808 
   13809    case 0x84: /* TEST Eb,Gb */
   13810       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13811       break;
   13812    case 0x85: /* TEST Ev,Gv */
   13813       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13814       break;
   13815 
   13816    /* ------------------------ opl Gv, Ev ----------------- */
   13817 
   13818    case 0x00: /* ADD Gb,Eb */
   13819       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13820                             Iop_Add8, True, 1, delta, "add" );
   13821       break;
   13822    case 0x01: /* ADD Gv,Ev */
   13823       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13824                             Iop_Add8, True, sz, delta, "add" );
   13825       break;
   13826 
   13827    case 0x08: /* OR Gb,Eb */
   13828       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13829                             Iop_Or8, True, 1, delta, "or" );
   13830       break;
   13831    case 0x09: /* OR Gv,Ev */
   13832       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13833                             Iop_Or8, True, sz, delta, "or" );
   13834       break;
   13835 
   13836    case 0x10: /* ADC Gb,Eb */
   13837       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13838                             Iop_Add8, True, 1, delta, "adc" );
   13839       break;
   13840    case 0x11: /* ADC Gv,Ev */
   13841       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13842                             Iop_Add8, True, sz, delta, "adc" );
   13843       break;
   13844 
   13845    case 0x18: /* SBB Gb,Eb */
   13846       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13847                             Iop_Sub8, True, 1, delta, "sbb" );
   13848       break;
   13849    case 0x19: /* SBB Gv,Ev */
   13850       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13851                             Iop_Sub8, True, sz, delta, "sbb" );
   13852       break;
   13853 
   13854    case 0x20: /* AND Gb,Eb */
   13855       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13856                             Iop_And8, True, 1, delta, "and" );
   13857       break;
   13858    case 0x21: /* AND Gv,Ev */
   13859       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13860                             Iop_And8, True, sz, delta, "and" );
   13861       break;
   13862 
   13863    case 0x28: /* SUB Gb,Eb */
   13864       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13865                             Iop_Sub8, True, 1, delta, "sub" );
   13866       break;
   13867    case 0x29: /* SUB Gv,Ev */
   13868       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13869                             Iop_Sub8, True, sz, delta, "sub" );
   13870       break;
   13871 
   13872    case 0x30: /* XOR Gb,Eb */
   13873       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13874                             Iop_Xor8, True, 1, delta, "xor" );
   13875       break;
   13876    case 0x31: /* XOR Gv,Ev */
   13877       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13878                             Iop_Xor8, True, sz, delta, "xor" );
   13879       break;
   13880 
   13881    case 0x38: /* CMP Gb,Eb */
   13882       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13883                             Iop_Sub8, False, 1, delta, "cmp" );
   13884       break;
   13885    case 0x39: /* CMP Gv,Ev */
   13886       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13887                             Iop_Sub8, False, sz, delta, "cmp" );
   13888       break;
   13889 
   13890    /* ------------------------ POP ------------------------ */
   13891 
   13892    case 0x58: /* POP eAX */
   13893    case 0x59: /* POP eCX */
   13894    case 0x5A: /* POP eDX */
   13895    case 0x5B: /* POP eBX */
   13896    case 0x5D: /* POP eBP */
   13897    case 0x5E: /* POP eSI */
   13898    case 0x5F: /* POP eDI */
   13899    case 0x5C: /* POP eSP */
   13900       vassert(sz == 2 || sz == 4);
   13901       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13902       assign(t2, getIReg(4, R_ESP));
   13903       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13904       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13905       putIReg(sz, opc-0x58, mkexpr(t1));
   13906       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13907       break;
   13908 
   13909    case 0x9D: /* POPF */
   13910       vassert(sz == 2 || sz == 4);
   13911       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13912       assign(t2, getIReg(4, R_ESP));
   13913       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13914       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13915 
   13916       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13917 	 value in t1. */
   13918       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13919                                  ((Addr32)guest_EIP_bbstart)+delta );
   13920 
   13921       DIP("popf%c\n", nameISize(sz));
   13922       break;
   13923 
   13924    case 0x61: /* POPA */
   13925       /* This is almost certainly wrong for sz==2.  So ... */
   13926       if (sz != 4) goto decode_failure;
   13927 
   13928       /* t5 is the old %ESP value. */
   13929       t5 = newTemp(Ity_I32);
   13930       assign( t5, getIReg(4, R_ESP) );
   13931 
   13932       /* Reload all the registers, except %esp. */
   13933       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13934       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13935       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13936       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13937       /* ignore saved %ESP */
   13938       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13939       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13940       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13941 
   13942       /* and move %ESP back up */
   13943       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13944 
   13945       DIP("popa%c\n", nameISize(sz));
   13946       break;
   13947 
   13948    case 0x8F: /* POPL/POPW m32 */
   13949      { Int    len;
   13950        UChar  rm = getIByte(delta);
   13951 
   13952        /* make sure this really is a POP to memory (reg field 0) */
   13953        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   13954           goto decode_failure;
   13955        /* and has correct size */
   13956        if (sz != 4 && sz != 2)
   13957           goto decode_failure;
   13958        ty = szToITy(sz);
   13959 
   13960        t1 = newTemp(Ity_I32); /* stack address */
   13961        t3 = newTemp(ty); /* data */
   13962        /* set t1 to ESP: t1 = ESP */
   13963        assign( t1, getIReg(4, R_ESP) );
   13964        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   13965        assign( t3, loadLE(ty, mkexpr(t1)) );
   13966 
   13967        /* increase ESP; must be done before the STORE.  Intel manual says:
   13968             If the ESP register is used as a base register for addressing
   13969             a destination operand in memory, the POP instruction computes
   13970             the effective address of the operand after it increments the
   13971             ESP register.
   13972        */
   13973        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   13974 
   13975        /* resolve MODR/M */
   13976        addr = disAMode ( &len, sorb, delta, dis_buf);
   13977        storeLE( mkexpr(addr), mkexpr(t3) );
   13978 
   13979        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   13980 
   13981        delta += len;
   13982        break;
   13983      }
   13984 
   13985    case 0x1F: /* POP %DS */
   13986       dis_pop_segreg( R_DS, sz ); break;
   13987    case 0x07: /* POP %ES */
   13988       dis_pop_segreg( R_ES, sz ); break;
   13989    case 0x17: /* POP %SS */
   13990       dis_pop_segreg( R_SS, sz ); break;
   13991 
   13992    /* ------------------------ PUSH ----------------------- */
   13993 
   13994    case 0x50: /* PUSH eAX */
   13995    case 0x51: /* PUSH eCX */
   13996    case 0x52: /* PUSH eDX */
   13997    case 0x53: /* PUSH eBX */
   13998    case 0x55: /* PUSH eBP */
   13999    case 0x56: /* PUSH eSI */
   14000    case 0x57: /* PUSH eDI */
   14001    case 0x54: /* PUSH eSP */
   14002       /* This is the Right Way, in that the value to be pushed is
   14003          established before %esp is changed, so that pushl %esp
   14004          correctly pushes the old value. */
   14005       vassert(sz == 2 || sz == 4);
   14006       ty = sz==2 ? Ity_I16 : Ity_I32;
   14007       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   14008       assign(t1, getIReg(sz, opc-0x50));
   14009       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   14010       putIReg(4, R_ESP, mkexpr(t2) );
   14011       storeLE(mkexpr(t2),mkexpr(t1));
   14012       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   14013       break;
   14014 
   14015 
   14016    case 0x68: /* PUSH Iv */
   14017       d32 = getUDisp(sz,delta); delta += sz;
   14018       goto do_push_I;
   14019    case 0x6A: /* PUSH Ib, sign-extended to sz */
   14020       d32 = getSDisp8(delta); delta += 1;
   14021       goto do_push_I;
   14022    do_push_I:
   14023       ty = szToITy(sz);
   14024       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   14025       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14026       putIReg(4, R_ESP, mkexpr(t1) );
   14027       /* stop mkU16 asserting if d32 is a negative 16-bit number
   14028          (bug #132813) */
   14029       if (ty == Ity_I16)
   14030          d32 &= 0xFFFF;
   14031       storeLE( mkexpr(t1), mkU(ty,d32) );
   14032       DIP("push%c $0x%x\n", nameISize(sz), d32);
   14033       break;
   14034 
   14035    case 0x9C: /* PUSHF */ {
   14036       vassert(sz == 2 || sz == 4);
   14037 
   14038       t1 = newTemp(Ity_I32);
   14039       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   14040       putIReg(4, R_ESP, mkexpr(t1) );
   14041 
   14042       /* Calculate OSZACP, and patch in fixed fields as per
   14043          Intel docs.
   14044          - bit 1 is always 1
   14045          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   14046       */
   14047       t2 = newTemp(Ity_I32);
   14048       assign( t2, binop(Iop_Or32,
   14049                         mk_x86g_calculate_eflags_all(),
   14050                         mkU32( (1<<1)|(1<<9) ) ));
   14051 
   14052       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   14053          the guest DFLAG field, which CLD/STD set to 1 / 0xFFFFFFFF. */
   14054       t3 = newTemp(Ity_I32);
   14055       assign( t3, binop(Iop_Or32,
   14056                         mkexpr(t2),
   14057                         binop(Iop_And32,
   14058                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   14059                               mkU32(1<<10)))
   14060             );
   14061 
   14062       /* And patch in the ID flag. */
   14063       t4 = newTemp(Ity_I32);
   14064       assign( t4, binop(Iop_Or32,
   14065                         mkexpr(t3),
   14066                         binop(Iop_And32,
   14067                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   14068                                                mkU8(21)),
   14069                               mkU32(1<<21)))
   14070             );
   14071 
   14072       /* And patch in the AC flag. */
   14073       t5 = newTemp(Ity_I32);
   14074       assign( t5, binop(Iop_Or32,
   14075                         mkexpr(t4),
   14076                         binop(Iop_And32,
   14077                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   14078                                                mkU8(18)),
   14079                               mkU32(1<<18)))
   14080             );
   14081 
   14082       /* if sz==2, the stored value needs to be narrowed. */
   14083       if (sz == 2)
   14084         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   14085       else
   14086         storeLE( mkexpr(t1), mkexpr(t5) );
   14087 
   14088       DIP("pushf%c\n", nameISize(sz));
   14089       break;
   14090    }
   14091 
   14092    case 0x60: /* PUSHA */
   14093       /* This is almost certainly wrong for sz==2.  So ... */
   14094       if (sz != 4) goto decode_failure;
   14095 
   14096       /* This is the Right Way, in that the value to be pushed is
   14097          established before %esp is changed, so that pusha
   14098          correctly pushes the old %esp value.  The new value of
   14099          %esp is written to the guest state before the stores. */
   14100       /* t0 is the %ESP value we're going to push. */
   14101       t0 = newTemp(Ity_I32);
   14102       assign( t0, getIReg(4, R_ESP) );
   14103 
   14104       /* t5 will be the new %ESP value. */
   14105       t5 = newTemp(Ity_I32);
   14106       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   14107 
   14108       /* Update guest state before prodding memory. */
   14109       putIReg(4, R_ESP, mkexpr(t5));
   14110 
   14111       /* Dump all the registers. */
   14112       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   14113       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   14114       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   14115       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   14116       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   14117       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   14118       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   14119       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   14120 
   14121       DIP("pusha%c\n", nameISize(sz));
   14122       break;
   14123 
   14124    case 0x0E: /* PUSH %CS */
   14125       dis_push_segreg( R_CS, sz ); break;
   14126    case 0x1E: /* PUSH %DS */
   14127       dis_push_segreg( R_DS, sz ); break;
   14128    case 0x06: /* PUSH %ES */
   14129       dis_push_segreg( R_ES, sz ); break;
   14130    case 0x16: /* PUSH %SS */
   14131       dis_push_segreg( R_SS, sz ); break;
   14132 
   14133    /* ------------------------ SCAS et al ----------------- */
   14134 
   14135    case 0xA4: /* MOVS, no REP prefix */
   14136    case 0xA5:
   14137       if (sorb != 0)
   14138          goto decode_failure; /* else dis_string_op asserts */
   14139       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   14140       break;
   14141 
   14142   case 0xA6: /* CMPS, no REP prefix */
   14143   case 0xA7:
   14144       if (sorb != 0)
   14145          goto decode_failure; /* else dis_string_op asserts */
   14146       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   14147       break;
   14148 
   14149    case 0xAA: /* STOS, no REP prefix */
   14150    case 0xAB:
   14151       if (sorb != 0)
   14152          goto decode_failure; /* else dis_string_op asserts */
   14153       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   14154       break;
   14155 
   14156    case 0xAC: /* LODS, no REP prefix */
   14157    case 0xAD:
   14158       if (sorb != 0)
   14159          goto decode_failure; /* else dis_string_op asserts */
   14160       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   14161       break;
   14162 
   14163    case 0xAE: /* SCAS, no REP prefix */
   14164    case 0xAF:
   14165       if (sorb != 0)
   14166          goto decode_failure; /* else dis_string_op asserts */
   14167       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   14168       break;
   14169 
   14170 
   14171    case 0xFC: /* CLD */
   14172       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   14173       DIP("cld\n");
   14174       break;
   14175 
   14176    case 0xFD: /* STD */
   14177       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   14178       DIP("std\n");
   14179       break;
   14180 
   14181    case 0xF8: /* CLC */
   14182    case 0xF9: /* STC */
   14183    case 0xF5: /* CMC */
   14184       t0 = newTemp(Ity_I32);
   14185       t1 = newTemp(Ity_I32);
   14186       assign( t0, mk_x86g_calculate_eflags_all() );
   14187       switch (opc) {
   14188          case 0xF8:
   14189             assign( t1, binop(Iop_And32, mkexpr(t0),
   14190                                          mkU32(~X86G_CC_MASK_C)));
   14191             DIP("clc\n");
   14192             break;
   14193          case 0xF9:
   14194             assign( t1, binop(Iop_Or32, mkexpr(t0),
   14195                                         mkU32(X86G_CC_MASK_C)));
   14196             DIP("stc\n");
   14197             break;
   14198          case 0xF5:
   14199             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   14200                                          mkU32(X86G_CC_MASK_C)));
   14201             DIP("cmc\n");
   14202             break;
   14203          default:
   14204             vpanic("disInstr(x86)(clc/stc/cmc)");
   14205       }
   14206       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14207       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14208       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   14209       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   14210          elimination of previous stores to this field work better. */
   14211       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14212       break;
   14213 
   14214    case 0xD6: /* SALC */
   14215       t0 = newTemp(Ity_I32);
   14216       t1 = newTemp(Ity_I32);
   14217       assign( t0,  binop(Iop_And32,
   14218                          mk_x86g_calculate_eflags_c(),
   14219                          mkU32(1)) );
   14220       assign( t1, binop(Iop_Sar32,
   14221                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   14222                         mkU8(31)) );
   14223       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   14224       DIP("salc\n");
   14225       break;
   14226 
   14227    /* REPNE prefix insn */
   14228    case 0xF2: {
   14229       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14230       if (sorb != 0) goto decode_failure;
   14231       abyte = getIByte(delta); delta++;
   14232 
   14233       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14234 
   14235       switch (abyte) {
   14236       /* According to the Intel manual, "repne movs" should never occur, but
   14237        * in practice it has happened, so allow for it here... */
   14238       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   14239       case 0xA5:
   14240          dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
   14241                              guest_EIP_bbstart+delta, "repne movs" );
   14242          break;
   14243 
   14244       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   14245       case 0xA7:
   14246          dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
   14247                              guest_EIP_bbstart+delta, "repne cmps" );
   14248          break;
   14249 
   14250       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   14251       case 0xAB:
   14252          dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
   14253                              guest_EIP_bbstart+delta, "repne stos" );
   14254          break;
   14255 
   14256       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   14257       case 0xAF:
   14258          dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
   14259                              guest_EIP_bbstart+delta, "repne scas" );
   14260          break;
   14261 
   14262       default:
   14263          goto decode_failure;
   14264       }
   14265       break;
   14266    }
   14267 
   14268    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   14269       for the rest, it means REP) */
   14270    case 0xF3: {
   14271       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14272       abyte = getIByte(delta); delta++;
   14273 
   14274       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14275 
   14276       if (sorb != 0 && abyte != 0x0F) goto decode_failure;
   14277 
   14278       switch (abyte) {
   14279       case 0x0F:
   14280          switch (getIByte(delta)) {
   14281          /* On older CPUs, TZCNT behaves the same as BSF.  */
   14282          case 0xBC: /* REP BSF Gv,Ev */
   14283             delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
   14284             break;
   14285          /* On older CPUs, LZCNT behaves the same as BSR.  */
   14286          case 0xBD: /* REP BSR Gv,Ev */
   14287             delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
   14288             break;
   14289          default:
   14290             goto decode_failure;
   14291          }
   14292          break;
   14293 
   14294       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   14295       case 0xA5:
   14296          dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
   14297                              guest_EIP_bbstart+delta, "rep movs" );
   14298          break;
   14299 
   14300       case 0xA6: sz = 1;   /* REPE CMP<sz> */
   14301       case 0xA7:
   14302          dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
   14303                              guest_EIP_bbstart+delta, "repe cmps" );
   14304          break;
   14305 
   14306       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14307       case 0xAB:
   14308          dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
   14309                              guest_EIP_bbstart+delta, "rep stos" );
   14310          break;
   14311 
   14312       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14313       case 0xAD:
   14314          dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
   14315                              guest_EIP_bbstart+delta, "rep lods" );
   14316          break;
   14317 
   14318       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14319       case 0xAF:
   14320          dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
   14321                              guest_EIP_bbstart+delta, "repe scas" );
   14322          break;
   14323 
   14324       case 0x90:           /* REP NOP (PAUSE) */
   14325          /* a hint to the P4 re spin-wait loop */
   14326          DIP("rep nop (P4 pause)\n");
   14327          /* "observe" the hint.  The Vex client needs to be careful not
   14328             to cause very long delays as a result, though. */
   14329          jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14330          vassert(dres.whatNext == Dis_StopHere);
   14331          break;
   14332 
   14333       case 0xC3:           /* REP RET -- same as normal ret? */
   14334          dis_ret(&dres, 0);
   14335          DIP("rep ret\n");
   14336          break;
   14337 
   14338       default:
   14339          goto decode_failure;
   14340       }
   14341       break;
   14342    }
   14343 
   14344    /* ------------------------ XCHG ----------------------- */
   14345 
   14346    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14347       prefix; hence it must be translated with an IRCAS (at least, the
   14348       memory variant). */
   14349    case 0x86: /* XCHG Gb,Eb */
   14350       sz = 1;
   14351       /* Fall through ... */
   14352    case 0x87: /* XCHG Gv,Ev */
   14353       modrm = getIByte(delta);
   14354       ty = szToITy(sz);
   14355       t1 = newTemp(ty); t2 = newTemp(ty);
   14356       if (epartIsReg(modrm)) {
   14357          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14358          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14359          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14360          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14361          delta++;
   14362          DIP("xchg%c %s, %s\n",
   14363              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14364                             nameIReg(sz,eregOfRM(modrm)));
   14365       } else {
   14366          *expect_CAS = True;
   14367          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14368          assign( t1, loadLE(ty,mkexpr(addr)) );
   14369          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14370          casLE( mkexpr(addr),
   14371                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14372          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14373          delta += alen;
   14374          DIP("xchg%c %s, %s\n", nameISize(sz),
   14375                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14376       }
   14377       break;
   14378 
   14379    case 0x90: /* XCHG eAX,eAX */
   14380       DIP("nop\n");
   14381       break;
   14382    case 0x91: /* XCHG eAX,eCX */
   14383    case 0x92: /* XCHG eAX,eDX */
   14384    case 0x93: /* XCHG eAX,eBX */
   14385    case 0x94: /* XCHG eAX,eSP */
   14386    case 0x95: /* XCHG eAX,eBP */
   14387    case 0x96: /* XCHG eAX,eSI */
   14388    case 0x97: /* XCHG eAX,eDI */
   14389       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14390       break;
   14391 
   14392    /* ------------------------ XLAT ----------------------- */
   14393 
   14394    case 0xD7: /* XLAT */
   14395       if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
   14396       putIReg(
   14397          1,
   14398          R_EAX/*AL*/,
   14399          loadLE(Ity_I8,
   14400                 handleSegOverride(
   14401                    sorb,
   14402                    binop(Iop_Add32,
   14403                          getIReg(4, R_EBX),
   14404                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14405 
   14406       DIP("xlat%c [ebx]\n", nameISize(sz));
   14407       break;
   14408 
   14409    /* ------------------------ IN / OUT ----------------------- */
   14410 
   14411    case 0xE4: /* IN imm8, AL */
   14412       sz = 1;
   14413       t1 = newTemp(Ity_I32);
   14414       abyte = getIByte(delta); delta++;
   14415       assign(t1, mkU32( abyte & 0xFF ));
   14416       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14417       goto do_IN;
   14418    case 0xE5: /* IN imm8, eAX */
   14419       vassert(sz == 2 || sz == 4);
   14420       t1 = newTemp(Ity_I32);
   14421       abyte = getIByte(delta); delta++;
   14422       assign(t1, mkU32( abyte & 0xFF ));
   14423       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14424       goto do_IN;
   14425    case 0xEC: /* IN %DX, AL */
   14426       sz = 1;
   14427       t1 = newTemp(Ity_I32);
   14428       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14429       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14430                                          nameIReg(sz,R_EAX));
   14431       goto do_IN;
   14432    case 0xED: /* IN %DX, eAX */
   14433       vassert(sz == 2 || sz == 4);
   14434       t1 = newTemp(Ity_I32);
   14435       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14436       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14437                                          nameIReg(sz,R_EAX));
   14438       goto do_IN;
   14439    do_IN: {
   14440       /* At this point, sz indicates the width, and t1 is a 32-bit
   14441          value giving port number. */
   14442       IRDirty* d;
   14443       vassert(sz == 1 || sz == 2 || sz == 4);
   14444       ty = szToITy(sz);
   14445       t2 = newTemp(Ity_I32);
   14446       d = unsafeIRDirty_1_N(
   14447              t2,
   14448              0/*regparms*/,
   14449              "x86g_dirtyhelper_IN",
   14450              &x86g_dirtyhelper_IN,
   14451              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14452           );
   14453       /* do the call, dumping the result in t2. */
   14454       stmt( IRStmt_Dirty(d) );
   14455       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14456       break;
   14457    }
   14458 
   14459    case 0xE6: /* OUT AL, imm8 */
   14460       sz = 1;
   14461       t1 = newTemp(Ity_I32);
   14462       abyte = getIByte(delta); delta++;
   14463       assign( t1, mkU32( abyte & 0xFF ) );
   14464       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14465       goto do_OUT;
   14466    case 0xE7: /* OUT eAX, imm8 */
   14467       vassert(sz == 2 || sz == 4);
   14468       t1 = newTemp(Ity_I32);
   14469       abyte = getIByte(delta); delta++;
   14470       assign( t1, mkU32( abyte & 0xFF ) );
   14471       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14472       goto do_OUT;
   14473    case 0xEE: /* OUT AL, %DX */
   14474       sz = 1;
   14475       t1 = newTemp(Ity_I32);
   14476       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14477       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14478                                           nameIReg(2,R_EDX));
   14479       goto do_OUT;
   14480    case 0xEF: /* OUT eAX, %DX */
   14481       vassert(sz == 2 || sz == 4);
   14482       t1 = newTemp(Ity_I32);
   14483       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14484       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14485                                           nameIReg(2,R_EDX));
   14486       goto do_OUT;
   14487    do_OUT: {
   14488       /* At this point, sz indicates the width, and t1 is a 32-bit
   14489          value giving port number. */
   14490       IRDirty* d;
   14491       vassert(sz == 1 || sz == 2 || sz == 4);
   14492       ty = szToITy(sz);
   14493       d = unsafeIRDirty_0_N(
   14494              0/*regparms*/,
   14495              "x86g_dirtyhelper_OUT",
   14496              &x86g_dirtyhelper_OUT,
   14497              mkIRExprVec_3( mkexpr(t1),
   14498                             widenUto32( getIReg(sz, R_EAX) ),
   14499                             mkU32(sz) )
   14500           );
   14501       stmt( IRStmt_Dirty(d) );
   14502       break;
   14503    }
   14504 
   14505    /* ------------------------ (Grp1 extensions) ---------- */
   14506 
   14507    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14508                  case 0x80, but only in 32-bit mode. */
   14509       /* fallthru */
   14510    case 0x80: /* Grp1 Ib,Eb */
   14511       modrm = getIByte(delta);
   14512       am_sz = lengthAMode(delta);
   14513       sz    = 1;
   14514       d_sz  = 1;
   14515       d32   = getUChar(delta + am_sz);
   14516       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14517       break;
   14518 
   14519    case 0x81: /* Grp1 Iv,Ev */
   14520       modrm = getIByte(delta);
   14521       am_sz = lengthAMode(delta);
   14522       d_sz  = sz;
   14523       d32   = getUDisp(d_sz, delta + am_sz);
   14524       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14525       break;
   14526 
   14527    case 0x83: /* Grp1 Ib,Ev */
   14528       modrm = getIByte(delta);
   14529       am_sz = lengthAMode(delta);
   14530       d_sz  = 1;
   14531       d32   = getSDisp8(delta + am_sz);
   14532       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14533       break;
   14534 
   14535    /* ------------------------ (Grp2 extensions) ---------- */
   14536 
   14537    case 0xC0: { /* Grp2 Ib,Eb */
   14538       Bool decode_OK = True;
   14539       modrm = getIByte(delta);
   14540       am_sz = lengthAMode(delta);
   14541       d_sz  = 1;
   14542       d32   = getUChar(delta + am_sz);
   14543       sz    = 1;
   14544       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14545                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14546       if (!decode_OK)
   14547          goto decode_failure;
   14548       break;
   14549    }
   14550    case 0xC1: { /* Grp2 Ib,Ev */
   14551       Bool decode_OK = True;
   14552       modrm = getIByte(delta);
   14553       am_sz = lengthAMode(delta);
   14554       d_sz  = 1;
   14555       d32   = getUChar(delta + am_sz);
   14556       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14557                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14558       if (!decode_OK)
   14559          goto decode_failure;
   14560       break;
   14561    }
   14562    case 0xD0: { /* Grp2 1,Eb */
   14563       Bool decode_OK = True;
   14564       modrm = getIByte(delta);
   14565       am_sz = lengthAMode(delta);
   14566       d_sz  = 0;
   14567       d32   = 1;
   14568       sz    = 1;
   14569       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14570                          mkU8(d32), NULL, &decode_OK );
   14571       if (!decode_OK)
   14572          goto decode_failure;
   14573       break;
   14574    }
   14575    case 0xD1: { /* Grp2 1,Ev */
   14576       Bool decode_OK = True;
   14577       modrm = getUChar(delta);
   14578       am_sz = lengthAMode(delta);
   14579       d_sz  = 0;
   14580       d32   = 1;
   14581       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14582                          mkU8(d32), NULL, &decode_OK );
   14583       if (!decode_OK)
   14584          goto decode_failure;
   14585       break;
   14586    }
   14587    case 0xD2: { /* Grp2 CL,Eb */
   14588       Bool decode_OK = True;
   14589       modrm = getUChar(delta);
   14590       am_sz = lengthAMode(delta);
   14591       d_sz  = 0;
   14592       sz    = 1;
   14593       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14594                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14595       if (!decode_OK)
   14596          goto decode_failure;
   14597       break;
   14598    }
   14599    case 0xD3: { /* Grp2 CL,Ev */
   14600       Bool decode_OK = True;
   14601       modrm = getIByte(delta);
   14602       am_sz = lengthAMode(delta);
   14603       d_sz  = 0;
   14604       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14605                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14606       if (!decode_OK)
   14607          goto decode_failure;
   14608       break;
   14609    }
   14610 
   14611    /* ------------------------ (Grp3 extensions) ---------- */
   14612 
   14613    case 0xF6: { /* Grp3 Eb */
   14614       Bool decode_OK = True;
   14615       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14616       if (!decode_OK)
   14617          goto decode_failure;
   14618       break;
   14619    }
   14620    case 0xF7: { /* Grp3 Ev */
   14621       Bool decode_OK = True;
   14622       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14623       if (!decode_OK)
   14624          goto decode_failure;
   14625       break;
   14626    }
   14627 
   14628    /* ------------------------ (Grp4 extensions) ---------- */
   14629 
   14630    case 0xFE: { /* Grp4 Eb */
   14631       Bool decode_OK = True;
   14632       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14633       if (!decode_OK)
   14634          goto decode_failure;
   14635       break;
   14636    }
   14637 
   14638    /* ------------------------ (Grp5 extensions) ---------- */
   14639 
   14640    case 0xFF: { /* Grp5 Ev */
   14641       Bool decode_OK = True;
   14642       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14643       if (!decode_OK)
   14644          goto decode_failure;
   14645       break;
   14646    }
   14647 
   14648    /* ------------------------ Escapes to 2-byte opcodes -- */
   14649 
   14650    case 0x0F: {
   14651       opc = getIByte(delta); delta++;
   14652       switch (opc) {
   14653 
   14654       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14655 
   14656       case 0xBA: { /* Grp8 Ib,Ev */
   14657          Bool decode_OK = False;
   14658          modrm = getUChar(delta);
   14659          am_sz = lengthAMode(delta);
   14660          d32   = getSDisp8(delta + am_sz);
   14661          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14662                                 am_sz, sz, d32, &decode_OK );
   14663          if (!decode_OK)
   14664             goto decode_failure;
   14665          break;
   14666       }
   14667 
   14668       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14669 
   14670       case 0xBC: /* BSF Gv,Ev */
   14671          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14672          break;
   14673       case 0xBD: /* BSR Gv,Ev */
   14674          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14675          break;
   14676 
   14677       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14678 
   14679       case 0xC8: /* BSWAP %eax */
   14680       case 0xC9:
   14681       case 0xCA:
   14682       case 0xCB:
   14683       case 0xCC:
   14684       case 0xCD:
   14685       case 0xCE:
   14686       case 0xCF: /* BSWAP %edi */
   14687          /* AFAICS from the Intel docs, this only exists at size 4. */
   14688          if (sz != 4) goto decode_failure;
   14689 
   14690          t1 = newTemp(Ity_I32);
   14691          assign( t1, getIReg(4, opc-0xC8) );
   14692          t2 = math_BSWAP(t1, Ity_I32);
   14693 
   14694          putIReg(4, opc-0xC8, mkexpr(t2));
   14695          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14696          break;
   14697 
   14698       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14699 
   14700       case 0xA3: /* BT Gv,Ev */
   14701          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14702          break;
   14703       case 0xB3: /* BTR Gv,Ev */
   14704          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14705          break;
   14706       case 0xAB: /* BTS Gv,Ev */
   14707          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14708          break;
   14709       case 0xBB: /* BTC Gv,Ev */
   14710          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14711          break;
   14712 
   14713       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14714 
   14715       case 0x40:
   14716       case 0x41:
   14717       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14718       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14719       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14720       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14721       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14722       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14723       case 0x48: /* CMOVSb (cmov negative) */
   14724       case 0x49: /* CMOVNSb (cmov not negative) */
   14725       case 0x4A: /* CMOVP (cmov parity even) */
   14726       case 0x4B: /* CMOVNP (cmov parity odd) */
   14727       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14728       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14729       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14730       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14731          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14732          break;
   14733 
   14734       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14735 
   14736       case 0xB0: /* CMPXCHG Gb,Eb */
   14737          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14738          break;
   14739       case 0xB1: /* CMPXCHG Gv,Ev */
   14740          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14741          break;
   14742 
   14743       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14744          IRTemp expdHi    = newTemp(Ity_I32);
   14745          IRTemp expdLo    = newTemp(Ity_I32);
   14746          IRTemp dataHi    = newTemp(Ity_I32);
   14747          IRTemp dataLo    = newTemp(Ity_I32);
   14748          IRTemp oldHi     = newTemp(Ity_I32);
   14749          IRTemp oldLo     = newTemp(Ity_I32);
   14750          IRTemp flags_old = newTemp(Ity_I32);
   14751          IRTemp flags_new = newTemp(Ity_I32);
   14752          IRTemp success   = newTemp(Ity_I1);
   14753 
   14754          /* Translate this using a DCAS, even if there is no LOCK
   14755             prefix.  Life is too short to bother with generating two
   14756             different translations for the with/without-LOCK-prefix
   14757             cases. */
   14758          *expect_CAS = True;
   14759 
   14760 	 /* Decode, and generate address. */
   14761          if (sz != 4) goto decode_failure;
   14762          modrm = getIByte(delta);
   14763          if (epartIsReg(modrm)) goto decode_failure;
   14764          if (gregOfRM(modrm) != 1) goto decode_failure;
   14765          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14766          delta += alen;
   14767 
   14768          /* Get the expected and new values. */
   14769          assign( expdHi, getIReg(4,R_EDX) );
   14770          assign( expdLo, getIReg(4,R_EAX) );
   14771          assign( dataHi, getIReg(4,R_ECX) );
   14772          assign( dataLo, getIReg(4,R_EBX) );
   14773 
   14774          /* Do the DCAS */
   14775          stmt( IRStmt_CAS(
   14776                   mkIRCAS( oldHi, oldLo,
   14777                            Iend_LE, mkexpr(addr),
   14778                            mkexpr(expdHi), mkexpr(expdLo),
   14779                            mkexpr(dataHi), mkexpr(dataLo)
   14780                )));
   14781 
   14782          /* success when oldHi:oldLo == expdHi:expdLo */
   14783          assign( success,
   14784                  binop(Iop_CasCmpEQ32,
   14785                        binop(Iop_Or32,
   14786                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14787                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14788                        ),
   14789                        mkU32(0)
   14790                  ));
   14791 
   14792          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14793             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14794             which is where they came from originally.  Both the actual
   14795             contents of these two regs, and any shadow values, are
   14796             unchanged.  If the DCAS fails then we're putting into
   14797             EDX:EAX the value seen in memory. */
   14798          putIReg(4, R_EDX,
   14799                     IRExpr_ITE( mkexpr(success),
   14800                                 mkexpr(expdHi), mkexpr(oldHi)
   14801                 ));
   14802          putIReg(4, R_EAX,
   14803                     IRExpr_ITE( mkexpr(success),
   14804                                 mkexpr(expdLo), mkexpr(oldLo)
   14805                 ));
   14806 
   14807          /* Copy the success bit into the Z flag and leave the others
   14808             unchanged */
   14809          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14810          assign(
   14811             flags_new,
   14812             binop(Iop_Or32,
   14813                   binop(Iop_And32, mkexpr(flags_old),
   14814                                    mkU32(~X86G_CC_MASK_Z)),
   14815                   binop(Iop_Shl32,
   14816                         binop(Iop_And32,
   14817                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14818                         mkU8(X86G_CC_SHIFT_Z)) ));
   14819 
   14820          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14821          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14822          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14823          /* Set NDEP even though it isn't used.  This makes
   14824             redundant-PUT elimination of previous stores to this field
   14825             work better. */
   14826          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14827 
   14828          /* Sheesh.  Aren't you glad it was me and not you that had to
   14829 	    write and validate all this grunge? */
   14830 
   14831 	 DIP("cmpxchg8b %s\n", dis_buf);
   14832 	 break;
   14833       }
   14834 
   14835       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14836 
   14837       case 0xA2: { /* CPUID */
   14838          /* Uses dirty helper:
   14839                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
   14840             declared to mod eax, ecx and wr ebx, edx
   14841          */
   14842          IRDirty* d     = NULL;
   14843          void*    fAddr = NULL;
   14844          const HChar* fName = NULL;
   14845          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14846             fName = "x86g_dirtyhelper_CPUID_sse2";
   14847             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14848          }
   14849          else
   14850          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14851             fName = "x86g_dirtyhelper_CPUID_sse1";
   14852             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14853          }
   14854          else
   14855          if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
   14856             fName = "x86g_dirtyhelper_CPUID_mmxext";
   14857             fAddr = &x86g_dirtyhelper_CPUID_mmxext;
   14858          }
   14859          else
   14860          if (archinfo->hwcaps == 0/*no SSE*/) {
   14861             fName = "x86g_dirtyhelper_CPUID_sse0";
   14862             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14863          } else
   14864             vpanic("disInstr(x86)(cpuid)");
   14865 
   14866          vassert(fName); vassert(fAddr);
   14867          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14868                                  fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   14869          /* declare guest state effects */
   14870          d->nFxState = 4;
   14871          vex_bzero(&d->fxState, sizeof(d->fxState));
   14872          d->fxState[0].fx     = Ifx_Modify;
   14873          d->fxState[0].offset = OFFB_EAX;
   14874          d->fxState[0].size   = 4;
   14875          d->fxState[1].fx     = Ifx_Write;
   14876          d->fxState[1].offset = OFFB_EBX;
   14877          d->fxState[1].size   = 4;
   14878          d->fxState[2].fx     = Ifx_Modify;
   14879          d->fxState[2].offset = OFFB_ECX;
   14880          d->fxState[2].size   = 4;
   14881          d->fxState[3].fx     = Ifx_Write;
   14882          d->fxState[3].offset = OFFB_EDX;
   14883          d->fxState[3].size   = 4;
   14884          /* execute the dirty call, side-effecting guest state */
   14885          stmt( IRStmt_Dirty(d) );
   14886          /* CPUID is a serialising insn.  So, just in case someone is
   14887             using it as a memory fence ... */
   14888          stmt( IRStmt_MBE(Imbe_Fence) );
   14889          DIP("cpuid\n");
   14890          break;
   14891       }
   14892 
   14893 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14894 //--             goto decode_failure;
   14895 //--
   14896 //--          t1 = newTemp(cb);
   14897 //--          t2 = newTemp(cb);
   14898 //--          t3 = newTemp(cb);
   14899 //--          t4 = newTemp(cb);
   14900 //--          uInstr0(cb, CALLM_S, 0);
   14901 //--
   14902 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14903 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14904 //--
   14905 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14906 //--          uLiteral(cb, 0);
   14907 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14908 //--
   14909 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14910 //--          uLiteral(cb, 0);
   14911 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14912 //--
   14913 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14914 //--          uLiteral(cb, 0);
   14915 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14916 //--
   14917 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14918 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14919 //--
   14920 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14921 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14922 //--
   14923 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14924 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14925 //--
   14926 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14927 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14928 //--
   14929 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14930 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14931 //--
   14932 //--          uInstr0(cb, CALLM_E, 0);
   14933 //--          DIP("cpuid\n");
   14934 //--          break;
   14935 //--
   14936       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14937 
   14938       case 0xB6: /* MOVZXb Eb,Gv */
   14939          if (sz != 2 && sz != 4)
   14940             goto decode_failure;
   14941          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   14942          break;
   14943 
   14944       case 0xB7: /* MOVZXw Ew,Gv */
   14945          if (sz != 4)
   14946             goto decode_failure;
   14947          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   14948          break;
   14949 
   14950       case 0xBE: /* MOVSXb Eb,Gv */
   14951          if (sz != 2 && sz != 4)
   14952             goto decode_failure;
   14953          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   14954          break;
   14955 
   14956       case 0xBF: /* MOVSXw Ew,Gv */
   14957          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   14958             goto decode_failure;
   14959          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   14960          break;
   14961 
   14962 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   14963 //--
   14964 //--       case 0xC3: /* MOVNTI Gv,Ev */
   14965 //--          vg_assert(sz == 4);
   14966 //--          modrm = getUChar(eip);
   14967 //--          vg_assert(!epartIsReg(modrm));
   14968 //--          t1 = newTemp(cb);
   14969 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   14970 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   14971 //--          t2 = LOW24(pair);
   14972 //--          eip += HI8(pair);
   14973 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   14974 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   14975 //--          break;
   14976 
   14977       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   14978 
   14979       case 0xAF: /* IMUL Ev, Gv */
   14980          delta = dis_mul_E_G ( sorb, sz, delta );
   14981          break;
   14982 
   14983       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   14984 
   14985       case 0x1F:
   14986          modrm = getUChar(delta);
   14987          if (epartIsReg(modrm)) goto decode_failure;
   14988          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14989          delta += alen;
   14990          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   14991          break;
   14992 
   14993       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   14994       case 0x80:
   14995       case 0x81:
   14996       case 0x82: /* JBb/JNAEb (jump below) */
   14997       case 0x83: /* JNBb/JAEb (jump not below) */
   14998       case 0x84: /* JZb/JEb (jump zero) */
   14999       case 0x85: /* JNZb/JNEb (jump not zero) */
   15000       case 0x86: /* JBEb/JNAb (jump below or equal) */
   15001       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   15002       case 0x88: /* JSb (jump negative) */
   15003       case 0x89: /* JSb (jump not negative) */
   15004       case 0x8A: /* JP (jump parity even) */
   15005       case 0x8B: /* JNP/JPO (jump parity odd) */
   15006       case 0x8C: /* JLb/JNGEb (jump less) */
   15007       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   15008       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   15009       case 0x8F: /* JGb/JNLEb (jump greater) */
   15010        { Int    jmpDelta;
   15011          const HChar* comment  = "";
   15012          jmpDelta = (Int)getUDisp32(delta);
   15013          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   15014          delta += 4;
   15015          if (resteerCisOk
   15016              && vex_control.guest_chase_cond
   15017              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15018              && jmpDelta < 0
   15019              && resteerOkFn( callback_opaque, (Addr32)d32) ) {
   15020             /* Speculation: assume this backward branch is taken.  So
   15021                we need to emit a side-exit to the insn following this
   15022                one, on the negation of the condition, and continue at
   15023                the branch target address (d32).  If we wind up back at
   15024                the first instruction of the trace, just stop; it's
   15025                better to let the IR loop unroller handle that case.*/
   15026             stmt( IRStmt_Exit(
   15027                      mk_x86g_calculate_condition((X86Condcode)
   15028                                                  (1 ^ (opc - 0x80))),
   15029                      Ijk_Boring,
   15030                      IRConst_U32(guest_EIP_bbstart+delta),
   15031                      OFFB_EIP ) );
   15032             dres.whatNext   = Dis_ResteerC;
   15033             dres.continueAt = (Addr32)d32;
   15034             comment = "(assumed taken)";
   15035          }
   15036          else
   15037          if (resteerCisOk
   15038              && vex_control.guest_chase_cond
   15039              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   15040              && jmpDelta >= 0
   15041              && resteerOkFn( callback_opaque,
   15042                              (Addr32)(guest_EIP_bbstart+delta)) ) {
   15043             /* Speculation: assume this forward branch is not taken.
   15044                So we need to emit a side-exit to d32 (the dest) and
   15045                continue disassembling at the insn immediately
   15046                following this one. */
   15047             stmt( IRStmt_Exit(
   15048                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   15049                      Ijk_Boring,
   15050                      IRConst_U32(d32),
   15051                      OFFB_EIP ) );
   15052             dres.whatNext   = Dis_ResteerC;
   15053             dres.continueAt = guest_EIP_bbstart + delta;
   15054             comment = "(assumed not taken)";
   15055          }
   15056          else {
   15057             /* Conservative default translation - end the block at
   15058                this point. */
   15059             jcc_01( &dres, (X86Condcode)(opc - 0x80),
   15060                     (Addr32)(guest_EIP_bbstart+delta), d32);
   15061             vassert(dres.whatNext == Dis_StopHere);
   15062          }
   15063          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   15064          break;
   15065        }
   15066 
   15067       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   15068       case 0x31: { /* RDTSC */
   15069          IRTemp   val  = newTemp(Ity_I64);
   15070          IRExpr** args = mkIRExprVec_0();
   15071          IRDirty* d    = unsafeIRDirty_1_N (
   15072                             val,
   15073                             0/*regparms*/,
   15074                             "x86g_dirtyhelper_RDTSC",
   15075                             &x86g_dirtyhelper_RDTSC,
   15076                             args
   15077                          );
   15078          /* execute the dirty call, dumping the result in val. */
   15079          stmt( IRStmt_Dirty(d) );
   15080          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   15081          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   15082          DIP("rdtsc\n");
   15083          break;
   15084       }
   15085 
   15086       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   15087 
   15088       case 0xA1: /* POP %FS */
   15089          dis_pop_segreg( R_FS, sz ); break;
   15090       case 0xA9: /* POP %GS */
   15091          dis_pop_segreg( R_GS, sz ); break;
   15092 
   15093       case 0xA0: /* PUSH %FS */
   15094          dis_push_segreg( R_FS, sz ); break;
   15095       case 0xA8: /* PUSH %GS */
   15096          dis_push_segreg( R_GS, sz ); break;
   15097 
   15098       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   15099       case 0x90:
   15100       case 0x91:
   15101       case 0x92: /* set-Bb/set-NAEb (jump below) */
   15102       case 0x93: /* set-NBb/set-AEb (jump not below) */
   15103       case 0x94: /* set-Zb/set-Eb (jump zero) */
   15104       case 0x95: /* set-NZb/set-NEb (jump not zero) */
   15105       case 0x96: /* set-BEb/set-NAb (jump below or equal) */
   15106       case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
   15107       case 0x98: /* set-Sb (jump negative) */
   15108       case 0x99: /* set-Sb (jump not negative) */
   15109       case 0x9A: /* set-P (jump parity even) */
   15110       case 0x9B: /* set-NP (jump parity odd) */
   15111       case 0x9C: /* set-Lb/set-NGEb (jump less) */
   15112       case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
   15113       case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
   15114       case 0x9F: /* set-Gb/set-NLEb (jump greater) */
   15115          t1 = newTemp(Ity_I8);
   15116          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   15117          modrm = getIByte(delta);
   15118          if (epartIsReg(modrm)) {
   15119             delta++;
   15120             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   15121             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   15122                               nameIReg(1,eregOfRM(modrm)));
   15123          } else {
   15124            addr = disAMode ( &alen, sorb, delta, dis_buf );
   15125            delta += alen;
   15126            storeLE( mkexpr(addr), mkexpr(t1) );
   15127            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   15128          }
   15129          break;
   15130 
   15131       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   15132 
   15133       case 0xA4: /* SHLDv imm8,Gv,Ev */
   15134          modrm = getIByte(delta);
   15135          d32   = delta + lengthAMode(delta);
   15136          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15137          delta = dis_SHLRD_Gv_Ev (
   15138                   sorb, delta, modrm, sz,
   15139                   mkU8(getIByte(d32)), True, /* literal */
   15140                   dis_buf, True );
   15141          break;
   15142       case 0xA5: /* SHLDv %cl,Gv,Ev */
   15143          modrm = getIByte(delta);
   15144          delta = dis_SHLRD_Gv_Ev (
   15145                     sorb, delta, modrm, sz,
   15146                     getIReg(1,R_ECX), False, /* not literal */
   15147                     "%cl", True );
   15148          break;
   15149 
   15150       case 0xAC: /* SHRDv imm8,Gv,Ev */
   15151          modrm = getIByte(delta);
   15152          d32   = delta + lengthAMode(delta);
   15153          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   15154          delta = dis_SHLRD_Gv_Ev (
   15155                     sorb, delta, modrm, sz,
   15156                     mkU8(getIByte(d32)), True, /* literal */
   15157                     dis_buf, False );
   15158          break;
   15159       case 0xAD: /* SHRDv %cl,Gv,Ev */
   15160          modrm = getIByte(delta);
   15161          delta = dis_SHLRD_Gv_Ev (
   15162                     sorb, delta, modrm, sz,
   15163                     getIReg(1,R_ECX), False, /* not literal */
   15164                     "%cl", False );
   15165          break;
   15166 
   15167       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   15168 
   15169       case 0x34:
   15170          /* Simple implementation needing a long explanation.
   15171 
   15172             sysenter is a kind of syscall entry.  The key thing here
   15173             is that the return address is not known -- that is
   15174             something that is beyond Vex's knowledge.  So this IR
   15175             forces a return to the scheduler, which can do what it
   15176             likes to simulate the sysenter, but it MUST set this
   15177             thread's guest_EIP field with the continuation address
   15178             before resuming execution.  If that doesn't happen, the
   15179             thread will jump to address zero, which is probably
   15180             fatal.
   15181          */
   15182 
   15183          /* Note where we are, so we can back up the guest to this
   15184             point if the syscall needs to be restarted. */
   15185          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15186                            mkU32(guest_EIP_curr_instr) ) );
   15187          jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   15188          vassert(dres.whatNext == Dis_StopHere);
   15189          DIP("sysenter");
   15190          break;
   15191 
   15192       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   15193 
   15194       case 0xC0: { /* XADD Gb,Eb */
   15195          Bool decodeOK;
   15196          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   15197          if (!decodeOK) goto decode_failure;
   15198          break;
   15199       }
   15200       case 0xC1: { /* XADD Gv,Ev */
   15201          Bool decodeOK;
   15202          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   15203          if (!decodeOK) goto decode_failure;
   15204          break;
   15205       }
   15206 
   15207       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   15208 
   15209       case 0x71:
   15210       case 0x72:
   15211       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   15212 
   15213       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   15214       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   15215       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   15216       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   15217 
   15218       case 0xFC:
   15219       case 0xFD:
   15220       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   15221 
   15222       case 0xEC:
   15223       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15224 
   15225       case 0xDC:
   15226       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15227 
   15228       case 0xF8:
   15229       case 0xF9:
   15230       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   15231 
   15232       case 0xE8:
   15233       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15234 
   15235       case 0xD8:
   15236       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15237 
   15238       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   15239       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   15240 
   15241       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   15242 
   15243       case 0x74:
   15244       case 0x75:
   15245       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   15246 
   15247       case 0x64:
   15248       case 0x65:
   15249       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   15250 
   15251       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   15252       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15253       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15254 
   15255       case 0x68:
   15256       case 0x69:
   15257       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   15258 
   15259       case 0x60:
   15260       case 0x61:
   15261       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15262 
   15263       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   15264       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   15265       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   15266       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   15267 
   15268       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15269       case 0xF2:
   15270       case 0xF3:
   15271 
   15272       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15273       case 0xD2:
   15274       case 0xD3:
   15275 
   15276       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   15277       case 0xE2:
   15278       {
   15279          Int  delta0    = delta-1;
   15280          Bool decode_OK = False;
   15281 
   15282          /* If sz==2 this is SSE, and we assume sse idec has
   15283             already spotted those cases by now. */
   15284          if (sz != 4)
   15285             goto decode_failure;
   15286 
   15287          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   15288          if (!decode_OK) {
   15289             delta = delta0;
   15290             goto decode_failure;
   15291          }
   15292          break;
   15293       }
   15294 
   15295       case 0x0E: /* FEMMS */
   15296       case 0x77: /* EMMS */
   15297          if (sz != 4)
   15298             goto decode_failure;
   15299          do_EMMS_preamble();
   15300          DIP("{f}emms\n");
   15301          break;
   15302 
   15303       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15304       case 0x01: /* 0F 01 /0 -- SGDT */
   15305                  /* 0F 01 /1 -- SIDT */
   15306       {
   15307           /* This is really revolting, but ... since each processor
   15308              (core) only has one IDT and one GDT, just let the guest
   15309              see it (pass-through semantics).  I can't see any way to
   15310              construct a faked-up value, so don't bother to try. */
   15311          modrm = getUChar(delta);
   15312          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15313          delta += alen;
   15314          if (epartIsReg(modrm)) goto decode_failure;
   15315          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15316             goto decode_failure;
   15317          switch (gregOfRM(modrm)) {
   15318             case 0: DIP("sgdt %s\n", dis_buf); break;
   15319             case 1: DIP("sidt %s\n", dis_buf); break;
   15320             default: vassert(0); /*NOTREACHED*/
   15321          }
   15322 
   15323          IRDirty* d = unsafeIRDirty_0_N (
   15324                           0/*regparms*/,
   15325                           "x86g_dirtyhelper_SxDT",
   15326                           &x86g_dirtyhelper_SxDT,
   15327                           mkIRExprVec_2( mkexpr(addr),
   15328                                          mkU32(gregOfRM(modrm)) )
   15329                       );
   15330          /* declare we're writing memory */
   15331          d->mFx   = Ifx_Write;
   15332          d->mAddr = mkexpr(addr);
   15333          d->mSize = 6;
   15334          stmt( IRStmt_Dirty(d) );
   15335          break;
   15336       }
   15337 
   15338       case 0x05: /* AMD's syscall */
   15339          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   15340               mkU32(guest_EIP_curr_instr) ) );
   15341          jmp_lit(&dres, Ijk_Sys_syscall, ((Addr32)guest_EIP_bbstart)+delta);
   15342          vassert(dres.whatNext == Dis_StopHere);
   15343          DIP("syscall\n");
   15344          break;
   15345 
   15346       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15347 
   15348       default:
   15349          goto decode_failure;
   15350    } /* switch (opc) for the 2-byte opcodes */
   15351    goto decode_success;
   15352    } /* case 0x0F: of primary opcode */
   15353 
   15354    /* ------------------------ ??? ------------------------ */
   15355 
   15356   default:
   15357   decode_failure:
   15358    /* All decode failures end up here. */
   15359    if (sigill_diag) {
   15360       vex_printf("vex x86->IR: unhandled instruction bytes: "
   15361                  "0x%x 0x%x 0x%x 0x%x\n",
   15362                  (Int)getIByte(delta_start+0),
   15363                  (Int)getIByte(delta_start+1),
   15364                  (Int)getIByte(delta_start+2),
   15365                  (Int)getIByte(delta_start+3) );
   15366    }
   15367 
   15368    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15369       not been executed, and (is currently) the next to be executed.
   15370       EIP should be up-to-date since it is made so at the start of each
   15371       insn, but nevertheless be paranoid and update it again right
   15372       now. */
   15373    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15374    jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
   15375    vassert(dres.whatNext == Dis_StopHere);
   15376    dres.len = 0;
   15377    /* We also need to say that a CAS is not expected now, regardless
   15378       of what it might have been set to at the start of the function,
   15379       since the IR that we've emitted just above (to synthesise a
   15380       SIGILL) does not involve any CAS, and presumably no other IR has
   15381       been emitted for this (non-decoded) insn. */
   15382    *expect_CAS = False;
   15383    return dres;
   15384 
   15385    } /* switch (opc) for the main (primary) opcode switch. */
   15386 
   15387   decode_success:
   15388    /* All decode successes end up here. */
   15389    switch (dres.whatNext) {
   15390       case Dis_Continue:
   15391          stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   15392          break;
   15393       case Dis_ResteerU:
   15394       case Dis_ResteerC:
   15395          stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
   15396          break;
   15397       case Dis_StopHere:
   15398          break;
   15399       default:
   15400          vassert(0);
   15401    }
   15402 
   15403    DIP("\n");
   15404    dres.len = delta - delta_start;
   15405    return dres;
   15406 }
   15407 
   15408 #undef DIP
   15409 #undef DIS
   15410 
   15411 
   15412 /*------------------------------------------------------------*/
   15413 /*--- Top-level fn                                         ---*/
   15414 /*------------------------------------------------------------*/
   15415 
   15416 /* Disassemble a single instruction into IR.  The instruction
   15417    is located in host memory at &guest_code[delta]. */
   15418 
   15419 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15420                          Bool         (*resteerOkFn) ( void*, Addr ),
   15421                          Bool         resteerCisOk,
   15422                          void*        callback_opaque,
   15423                          const UChar* guest_code_IN,
   15424                          Long         delta,
   15425                          Addr         guest_IP,
   15426                          VexArch      guest_arch,
   15427                          const VexArchInfo* archinfo,
   15428                          const VexAbiInfo*  abiinfo,
   15429                          VexEndness   host_endness_IN,
   15430                          Bool         sigill_diag_IN )
   15431 {
   15432    Int       i, x1, x2;
   15433    Bool      expect_CAS, has_CAS;
   15434    DisResult dres;
   15435 
   15436    /* Set globals (see top of this file) */
   15437    vassert(guest_arch == VexArchX86);
   15438    guest_code           = guest_code_IN;
   15439    irsb                 = irsb_IN;
   15440    host_endness         = host_endness_IN;
   15441    guest_EIP_curr_instr = (Addr32)guest_IP;
   15442    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15443 
   15444    x1 = irsb_IN->stmts_used;
   15445    expect_CAS = False;
   15446    dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15447                              resteerCisOk,
   15448                              callback_opaque,
   15449                              delta, archinfo, abiinfo, sigill_diag_IN );
   15450    x2 = irsb_IN->stmts_used;
   15451    vassert(x2 >= x1);
   15452 
   15453    /* See comment at the top of disInstr_X86_WRK for meaning of
   15454       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15455       IRCAS as directed by the returned expect_CAS value. */
   15456    has_CAS = False;
   15457    for (i = x1; i < x2; i++) {
   15458       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15459          has_CAS = True;
   15460    }
   15461 
   15462    if (expect_CAS != has_CAS) {
   15463       /* inconsistency detected.  re-disassemble the instruction so as
   15464          to generate a useful error message; then assert. */
   15465       vex_traceflags |= VEX_TRACE_FE;
   15466       dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15467                                 resteerCisOk,
   15468                                 callback_opaque,
   15469                                 delta, archinfo, abiinfo, sigill_diag_IN );
   15470       for (i = x1; i < x2; i++) {
   15471          vex_printf("\t\t");
   15472          ppIRStmt(irsb_IN->stmts[i]);
   15473          vex_printf("\n");
   15474       }
   15475       /* Failure of this assertion is serious and denotes a bug in
   15476          disInstr. */
   15477       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15478    }
   15479 
   15480    return dres;
   15481 }
   15482 
   15483 
   15484 /*--------------------------------------------------------------------*/
   15485 /*--- end                                         guest_x86_toIR.c ---*/
   15486 /*--------------------------------------------------------------------*/
   15487