Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2012 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     58      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     59      even when it isn't.
     60 
     61    * some of the FCOM cases could do with testing -- not convinced
     62      that the args are the right way round.
     63 
     64    * FSAVE does not re-initialise the FPU; it should do
     65 
     66    * FINIT not only initialises the FPU environment, it also
     67      zeroes all the FP registers.  It should leave the registers
     68      unchanged.
     69 
     70    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     71    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     72    only way to observe eflags[1], a proper fix would be to make that
     73    bit be set by PUSHF.
     74 
     75    The state of %eflags.AC (alignment check, bit 18) is recorded by
     76    the simulation (viz, if you set it with popf then a pushf produces
     77    the value you set it to), but it is otherwise ignored.  In
     78    particular, setting it to 1 does NOT cause alignment checking to
     79    happen.  Programs that set it to 1 and then rely on the resulting
     80    SIGBUSs to inform them of misaligned accesses will not work.
     81 
     82    Implementation of sysenter is necessarily partial.  sysenter is a
     83    kind of system call entry.  When doing a sysenter, the return
     84    address is not known -- that is something that is beyond Vex's
     85    knowledge.  So the generated IR forces a return to the scheduler,
     86    which can do what it likes to simulate the sysenter, but it MUST
     87    set this thread's guest_EIP field with the continuation address
     88    before resuming execution.  If that doesn't happen, the thread will
     89    jump to address zero, which is probably fatal.
     90 
     91    This module uses global variables and so is not MT-safe (if that
     92    should ever become relevant).
     93 
     94    The delta values are 32-bit ints, not 64-bit ints.  That means
     95    this module may not work right if run on a 64-bit host.  That should
     96    be fixed properly, really -- if anyone ever wants to use Vex to
     97    translate x86 code for execution on a 64-bit host.
     98 
     99    casLE (implementation of lock-prefixed insns) and rep-prefixed
    100    insns: the side-exit back to the start of the insn is done with
    101    Ijk_Boring.  This is quite wrong, it should be done with
    102    Ijk_NoRedir, since otherwise the side exit, which is intended to
    103    restart the instruction for whatever reason, could go somewhere
    104    else entirely.  Doing it right (with Ijk_NoRedir jumps) would make
    105    no-redir jumps performance critical, at least for rep-prefixed
    106    instructions, since all iterations thereof would involve such a
    107    jump.  It's not such a big deal with casLE since the side exit is
    108    only taken if the CAS fails, that is, the location is contended,
    109    which is relatively unlikely.
    110 
    111    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    112    problem.
    113 
    114    Note also, the test for CAS success vs failure is done using
    115    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    116    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    117    shouldn't definedness-check these comparisons.  See
    118    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    119    background/rationale.
    120 */
    121 
    122 /* Performance holes:
    123 
    124    - fcom ; fstsw %ax ; sahf
    125      sahf does not update the O flag (sigh) and so O needs to
    126      be computed.  This is done expensively; it would be better
    127      to have a calculate_eflags_o helper.
    128 
    129    - emwarns; some FP codes can generate huge numbers of these
    130      if the fpucw is changed in an inner loop.  It would be
    131      better for the guest state to have an emwarn-enable reg
    132      which can be set zero or nonzero.  If it is zero, emwarns
    133      are not flagged, and instead control just flows all the
    134      way through bbs as usual.
    135 */
    136 
    137 /* "Special" instructions.
    138 
    139    This instruction decoder can decode three special instructions
    140    which mean nothing natively (are no-ops as far as regs/mem are
    141    concerned) but have meaning for supporting Valgrind.  A special
    142    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    143    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    144    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    145    one of the following 3 are allowed (standard interpretation in
    146    parentheses):
    147 
    148       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    149       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    150       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    151 
    152    Any other bytes following the 12-byte preamble are illegal and
    153    constitute a failure in instruction decoding.  This all assumes
    154    that the preamble will never occur except in specific code
    155    fragments designed for Valgrind to catch.
    156 
    157    No prefixes may precede a "Special" instruction.
    158 */
    159 
    160 /* LOCK prefixed instructions.  These are translated using IR-level
    161    CAS statements (IRCAS) and are believed to preserve atomicity, even
    162    from the point of view of some other process racing against a
    163    simulated one (presumably they communicate via a shared memory
    164    segment).
    165 
    166    Handlers which are aware of LOCK prefixes are:
    167       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    168       dis_cmpxchg_G_E  (cmpxchg)
    169       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    170       dis_Grp3         (not, neg)
    171       dis_Grp4         (inc, dec)
    172       dis_Grp5         (inc, dec)
    173       dis_Grp8_Imm     (bts, btc, btr)
    174       dis_bt_G_E       (bts, btc, btr)
    175       dis_xadd_G_E     (xadd)
    176 */
    177 
    178 
    179 #include "libvex_basictypes.h"
    180 #include "libvex_ir.h"
    181 #include "libvex.h"
    182 #include "libvex_guest_x86.h"
    183 
    184 #include "main_util.h"
    185 #include "main_globals.h"
    186 #include "guest_generic_bb_to_IR.h"
    187 #include "guest_generic_x87.h"
    188 #include "guest_x86_defs.h"
    189 
    190 
    191 /*------------------------------------------------------------*/
    192 /*--- Globals                                              ---*/
    193 /*------------------------------------------------------------*/
    194 
    195 /* These are set at the start of the translation of an insn, right
    196    down in disInstr_X86, so that we don't have to pass them around
    197    endlessly.  They are all constant during the translation of any
    198    given insn. */
    199 
/* True iff the host is big-endian.  We need to know this to do
   sub-register accesses correctly; the guest-state offset helpers in
   this file assert that it is False. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed).  The insn-stream fetchers below index it
   with a 'delta' relative to this base. */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr32 guest_EIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr32 guest_EIP_curr_instr;

/* The IRSB* into which we're generating code.  stmt() appends to it
   and newTemp() allocates temporaries in its type environment. */
static IRSB* irsb;
    216 
    217 
    218 /*------------------------------------------------------------*/
    219 /*--- Debugging output                                     ---*/
    220 /*------------------------------------------------------------*/
    221 
    222 #define DIP(format, args...)           \
    223    if (vex_traceflags & VEX_TRACE_FE)  \
    224       vex_printf(format, ## args)
    225 
    226 #define DIS(buf, format, args...)      \
    227    if (vex_traceflags & VEX_TRACE_FE)  \
    228       vex_sprintf(buf, format, ## args)
    229 
    230 
    231 /*------------------------------------------------------------*/
    232 /*--- Offsets of various parts of the x86 guest state.     ---*/
    233 /*------------------------------------------------------------*/
    234 
/* Each OFFB_xxx is the byte offset of the field guest_xxx within
   VexGuestX86State.  They are used as the offset argument of
   IRExpr_Get / IRStmt_Put throughout this file. */

/* Integer registers. */
#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)

#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)

/* The four fields from which %eflags is computed (the flags thunk). */
#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)

/* FPREGS and FPTAGS give the offset of element [0] of an array. */
#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)

/* Segment registers and descriptor tables. */
#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)

/* SSE state. */
#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)

#define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)

#define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
#define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)

#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    286 
    287 
    288 /*------------------------------------------------------------*/
    289 /*--- Helper bits and pieces for deconstructing the        ---*/
    290 /*--- x86 insn stream.                                     ---*/
    291 /*------------------------------------------------------------*/
    292 
/* This is the Intel register encoding -- integer regs. */
#define R_EAX 0
#define R_ECX 1
#define R_EDX 2
#define R_EBX 3
#define R_ESP 4
#define R_EBP 5
#define R_ESI 6
#define R_EDI 7

/* 8-bit register numbers for the low and high byte of %eax.  For
   1-byte accesses, numbers 0..3 select a low byte and 4..7 the
   second byte of the register numbered (n-4); see
   integerGuestRegOffset. */
#define R_AL (0+R_EAX)
#define R_AH (4+R_EAX)

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5
    313 
    314 
    315 /* Add a statement to the list held by "irbb". */
    316 static void stmt ( IRStmt* st )
    317 {
    318    addStmtToIRSB( irsb, st );
    319 }
    320 
    321 /* Generate a new temporary of the given type. */
    322 static IRTemp newTemp ( IRType ty )
    323 {
    324    vassert(isPlausibleIRType(ty));
    325    return newIRTemp( irsb->tyenv, ty );
    326 }
    327 
    328 /* Various simple conversions */
    329 
    330 static UInt extend_s_8to32( UInt x )
    331 {
    332    return (UInt)((((Int)x) << 24) >> 24);
    333 }
    334 
    335 static UInt extend_s_16to32 ( UInt x )
    336 {
    337    return (UInt)((((Int)x) << 16) >> 16);
    338 }
    339 
    340 /* Fetch a byte from the guest insn stream. */
    341 static UChar getIByte ( Int delta )
    342 {
    343    return guest_code[delta];
    344 }
    345 
    346 /* Extract the reg field from a modRM byte. */
    347 static Int gregOfRM ( UChar mod_reg_rm )
    348 {
    349    return (Int)( (mod_reg_rm >> 3) & 7 );
    350 }
    351 
    352 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    353    register or memory.  If so, the byte will have the form 11XXXYYY,
    354    where YYY is the register number. */
    355 static Bool epartIsReg ( UChar mod_reg_rm )
    356 {
    357    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    358 }
    359 
    360 /* ... and extract the register number ... */
    361 static Int eregOfRM ( UChar mod_reg_rm )
    362 {
    363    return (Int)(mod_reg_rm & 0x7);
    364 }
    365 
    366 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    367 
    368 static UChar getUChar ( Int delta )
    369 {
    370    UChar v = guest_code[delta+0];
    371    return toUChar(v);
    372 }
    373 
    374 static UInt getUDisp16 ( Int delta )
    375 {
    376    UInt v = guest_code[delta+1]; v <<= 8;
    377    v |= guest_code[delta+0];
    378    return v & 0xFFFF;
    379 }
    380 
    381 static UInt getUDisp32 ( Int delta )
    382 {
    383    UInt v = guest_code[delta+3]; v <<= 8;
    384    v |= guest_code[delta+2]; v <<= 8;
    385    v |= guest_code[delta+1]; v <<= 8;
    386    v |= guest_code[delta+0];
    387    return v;
    388 }
    389 
    390 static UInt getUDisp ( Int size, Int delta )
    391 {
    392    switch (size) {
    393       case 4: return getUDisp32(delta);
    394       case 2: return getUDisp16(delta);
    395       case 1: return (UInt)getUChar(delta);
    396       default: vpanic("getUDisp(x86)");
    397    }
    398    return 0; /*notreached*/
    399 }
    400 
    401 
    402 /* Get a byte value out of the insn stream and sign-extend to 32
    403    bits. */
    404 static UInt getSDisp8 ( Int delta )
    405 {
    406    return extend_s_8to32( (UInt) (guest_code[delta]) );
    407 }
    408 
    409 static UInt getSDisp16 ( Int delta0 )
    410 {
    411    UChar* eip = (UChar*)(&guest_code[delta0]);
    412    UInt d = *eip++;
    413    d |= ((*eip++) << 8);
    414    return extend_s_16to32(d);
    415 }
    416 
    417 static UInt getSDisp ( Int size, Int delta )
    418 {
    419    switch (size) {
    420       case 4: return getUDisp32(delta);
    421       case 2: return getSDisp16(delta);
    422       case 1: return getSDisp8(delta);
    423       default: vpanic("getSDisp(x86)");
    424   }
    425   return 0; /*notreached*/
    426 }
    427 
    428 
    429 /*------------------------------------------------------------*/
    430 /*--- Helpers for constructing IR.                         ---*/
    431 /*------------------------------------------------------------*/
    432 
    433 /* (Describes getIReg, further below.)  Create a 1/2/4 byte read of
    434    an x86 integer register.  For 16/8 bit register references, we
    435    need to take the host endianness into account.  Supplied value is
    436    0 .. 7 and in the Intel instruction encoding. */
    437 
    438 static IRType szToITy ( Int n )
    439 {
    440    switch (n) {
    441       case 1: return Ity_I8;
    442       case 2: return Ity_I16;
    443       case 4: return Ity_I32;
    444       default: vpanic("szToITy(x86)");
    445    }
    446 }
    447 
    448 /* On a little-endian host, less significant bits of the guest
    449    registers are at lower addresses.  Therefore, if a reference to a
    450    register low half has the safe guest state offset as a reference to
    451    the full register.
    452 */
    453 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    454 {
    455    vassert(archreg < 8);
    456 
    457    /* Correct for little-endian host only. */
    458    vassert(!host_is_bigendian);
    459 
    460    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    461       switch (archreg) {
    462          case R_EAX: return OFFB_EAX;
    463          case R_EBX: return OFFB_EBX;
    464          case R_ECX: return OFFB_ECX;
    465          case R_EDX: return OFFB_EDX;
    466          case R_ESI: return OFFB_ESI;
    467          case R_EDI: return OFFB_EDI;
    468          case R_ESP: return OFFB_ESP;
    469          case R_EBP: return OFFB_EBP;
    470          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    471       }
    472    }
    473 
    474    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    475    switch (archreg-4) {
    476       case R_EAX: return 1+ OFFB_EAX;
    477       case R_EBX: return 1+ OFFB_EBX;
    478       case R_ECX: return 1+ OFFB_ECX;
    479       case R_EDX: return 1+ OFFB_EDX;
    480       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    481    }
    482 
    483    /* NOTREACHED */
    484    vpanic("integerGuestRegOffset(x86,le)");
    485 }
    486 
    487 static Int segmentGuestRegOffset ( UInt sreg )
    488 {
    489    switch (sreg) {
    490       case R_ES: return OFFB_ES;
    491       case R_CS: return OFFB_CS;
    492       case R_SS: return OFFB_SS;
    493       case R_DS: return OFFB_DS;
    494       case R_FS: return OFFB_FS;
    495       case R_GS: return OFFB_GS;
    496       default: vpanic("segmentGuestRegOffset(x86)");
    497    }
    498 }
    499 
    500 static Int xmmGuestRegOffset ( UInt xmmreg )
    501 {
    502    switch (xmmreg) {
    503       case 0: return OFFB_XMM0;
    504       case 1: return OFFB_XMM1;
    505       case 2: return OFFB_XMM2;
    506       case 3: return OFFB_XMM3;
    507       case 4: return OFFB_XMM4;
    508       case 5: return OFFB_XMM5;
    509       case 6: return OFFB_XMM6;
    510       case 7: return OFFB_XMM7;
    511       default: vpanic("xmmGuestRegOffset");
    512    }
    513 }
    514 
    515 /* Lanes of vector registers are always numbered from zero being the
    516    least significant lane (rightmost in the register).  */
    517 
    518 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    519 {
    520    /* Correct for little-endian host only. */
    521    vassert(!host_is_bigendian);
    522    vassert(laneno >= 0 && laneno < 8);
    523    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    524 }
    525 
    526 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    527 {
    528    /* Correct for little-endian host only. */
    529    vassert(!host_is_bigendian);
    530    vassert(laneno >= 0 && laneno < 4);
    531    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    532 }
    533 
    534 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    535 {
    536    /* Correct for little-endian host only. */
    537    vassert(!host_is_bigendian);
    538    vassert(laneno >= 0 && laneno < 2);
    539    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    540 }
    541 
    542 static IRExpr* getIReg ( Int sz, UInt archreg )
    543 {
    544    vassert(sz == 1 || sz == 2 || sz == 4);
    545    vassert(archreg < 8);
    546    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    547                       szToITy(sz) );
    548 }
    549 
    550 /* Ditto, but write to a reg instead. */
    551 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    552 {
    553    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    554    switch (sz) {
    555       case 1: vassert(ty == Ity_I8); break;
    556       case 2: vassert(ty == Ity_I16); break;
    557       case 4: vassert(ty == Ity_I32); break;
    558       default: vpanic("putIReg(x86)");
    559    }
    560    vassert(archreg < 8);
    561    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    562 }
    563 
    564 static IRExpr* getSReg ( UInt sreg )
    565 {
    566    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    567 }
    568 
    569 static void putSReg ( UInt sreg, IRExpr* e )
    570 {
    571    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    572    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    573 }
    574 
    575 static IRExpr* getXMMReg ( UInt xmmreg )
    576 {
    577    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    578 }
    579 
    580 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    581 {
    582    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    583 }
    584 
    585 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    586 {
    587    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    588 }
    589 
    590 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    591 {
    592    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    593 }
    594 
    595 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    596 {
    597    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    598 }
    599 
    600 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    601 {
    602    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    603    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    604 }
    605 
    606 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    607 {
    608    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    609    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    610 }
    611 
    612 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    613 {
    614    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    615    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    616 }
    617 
    618 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    619 {
    620    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    621    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    622 }
    623 
    624 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    625 {
    626    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    627    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    628 }
    629 
    630 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    631 {
    632    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    633    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    634 }
    635 
    636 static void assign ( IRTemp dst, IRExpr* e )
    637 {
    638    stmt( IRStmt_WrTmp(dst, e) );
    639 }
    640 
    641 static void storeLE ( IRExpr* addr, IRExpr* data )
    642 {
    643    stmt( IRStmt_Store(Iend_LE, addr, data) );
    644 }
    645 
    646 static IRExpr* unop ( IROp op, IRExpr* a )
    647 {
    648    return IRExpr_Unop(op, a);
    649 }
    650 
    651 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    652 {
    653    return IRExpr_Binop(op, a1, a2);
    654 }
    655 
    656 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    657 {
    658    return IRExpr_Triop(op, a1, a2, a3);
    659 }
    660 
    661 static IRExpr* mkexpr ( IRTemp tmp )
    662 {
    663    return IRExpr_RdTmp(tmp);
    664 }
    665 
    666 static IRExpr* mkU8 ( UInt i )
    667 {
    668    vassert(i < 256);
    669    return IRExpr_Const(IRConst_U8( (UChar)i ));
    670 }
    671 
    672 static IRExpr* mkU16 ( UInt i )
    673 {
    674    vassert(i < 65536);
    675    return IRExpr_Const(IRConst_U16( (UShort)i ));
    676 }
    677 
    678 static IRExpr* mkU32 ( UInt i )
    679 {
    680    return IRExpr_Const(IRConst_U32(i));
    681 }
    682 
    683 static IRExpr* mkU64 ( ULong i )
    684 {
    685    return IRExpr_Const(IRConst_U64(i));
    686 }
    687 
    688 static IRExpr* mkU ( IRType ty, UInt i )
    689 {
    690    if (ty == Ity_I8)  return mkU8(i);
    691    if (ty == Ity_I16) return mkU16(i);
    692    if (ty == Ity_I32) return mkU32(i);
    693    /* If this panics, it usually means you passed a size (1,2,4)
    694       value as the IRType, rather than a real IRType. */
    695    vpanic("mkU(x86)");
    696 }
    697 
    698 static IRExpr* mkV128 ( UShort mask )
    699 {
    700    return IRExpr_Const(IRConst_V128(mask));
    701 }
    702 
    703 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    704 {
    705    return IRExpr_Load(Iend_LE, ty, addr);
    706 }
    707 
    708 static IROp mkSizedOp ( IRType ty, IROp op8 )
    709 {
    710    Int adj;
    711    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    712    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    713            || op8 == Iop_Mul8
    714            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    715            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    716            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    717            || op8 == Iop_CasCmpNE8
    718            || op8 == Iop_Not8);
    719    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    720    return adj + op8;
    721 }
    722 
    723 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    724 {
    725    if (szSmall == 1 && szBig == 4) {
    726       return signd ? Iop_8Sto32 : Iop_8Uto32;
    727    }
    728    if (szSmall == 1 && szBig == 2) {
    729       return signd ? Iop_8Sto16 : Iop_8Uto16;
    730    }
    731    if (szSmall == 2 && szBig == 4) {
    732       return signd ? Iop_16Sto32 : Iop_16Uto32;
    733    }
    734    vpanic("mkWidenOp(x86,guest)");
    735 }
    736 
    737 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    738 {
    739    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    740    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    741    return unop(Iop_32to1,
    742                binop(Iop_And32,
    743                      unop(Iop_1Uto32,x),
    744                      unop(Iop_1Uto32,y)));
    745 }
    746 
    747 /* Generate a compare-and-swap operation, operating on memory at
    748    'addr'.  The expected value is 'expVal' and the new value is
    749    'newVal'.  If the operation fails, then transfer control (with a
    750    no-redir jump (XXX no -- see comment at top of this file)) to
    751    'restart_point', which is presumably the address of the guest
    752    instruction again -- retrying, essentially. */
    753 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    754                     Addr32 restart_point )
    755 {
    756    IRCAS* cas;
    757    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    758    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    759    IRTemp oldTmp = newTemp(tyE);
    760    IRTemp expTmp = newTemp(tyE);
    761    vassert(tyE == tyN);
    762    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    763    assign(expTmp, expVal);
    764    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    765                   NULL, mkexpr(expTmp), NULL, newVal );
    766    stmt( IRStmt_CAS(cas) );
    767    stmt( IRStmt_Exit(
    768             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    769                    mkexpr(oldTmp), mkexpr(expTmp) ),
    770             Ijk_Boring, /*Ijk_NoRedir*/
    771             IRConst_U32( restart_point ),
    772             OFFB_EIP
    773          ));
    774 }
    775 
    776 
    777 /*------------------------------------------------------------*/
    778 /*--- Helpers for %eflags.                                 ---*/
    779 /*------------------------------------------------------------*/
    780 
    781 /* -------------- Evaluating the flags-thunk. -------------- */
    782 
    783 /* Build IR to calculate all the eflags from stored
    784    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    785    Ity_I32. */
    786 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    787 {
    788    IRExpr** args
    789       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    790                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    791                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    792                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    793    IRExpr* call
    794       = mkIRExprCCall(
    795            Ity_I32,
    796            0/*regparm*/,
    797            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    798            args
    799         );
    800    /* Exclude OP and NDEP from definedness checking.  We're only
    801       interested in DEP1 and DEP2. */
    802    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    803    return call;
    804 }
    805 
    806 /* Build IR to calculate some particular condition from stored
    807    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    808    Ity_Bit. */
    809 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    810 {
    811    IRExpr** args
    812       = mkIRExprVec_5( mkU32(cond),
    813                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    814                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    815                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    816                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    817    IRExpr* call
    818       = mkIRExprCCall(
    819            Ity_I32,
    820            0/*regparm*/,
    821            "x86g_calculate_condition", &x86g_calculate_condition,
    822            args
    823         );
    824    /* Exclude the requested condition, OP and NDEP from definedness
    825       checking.  We're only interested in DEP1 and DEP2. */
    826    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    827    return unop(Iop_32to1, call);
    828 }
    829 
    830 /* Build IR to calculate just the carry flag from stored
    831    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    832 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    833 {
    834    IRExpr** args
    835       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    836                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    837                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    838                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    839    IRExpr* call
    840       = mkIRExprCCall(
    841            Ity_I32,
    842            3/*regparm*/,
    843            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    844            args
    845         );
    846    /* Exclude OP and NDEP from definedness checking.  We're only
    847       interested in DEP1 and DEP2. */
    848    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    849    return call;
    850 }
    851 
    852 
    853 /* -------------- Building the flags-thunk. -------------- */
    854 
    855 /* The machinery in this section builds the flag-thunk following a
    856    flag-setting operation.  Hence the various setFlags_* functions.
    857 */
    858 
    859 static Bool isAddSub ( IROp op8 )
    860 {
    861    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    862 }
    863 
    864 static Bool isLogic ( IROp op8 )
    865 {
    866    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    867 }
    868 
    869 /* U-widen 8/16/32 bit int expr to 32. */
    870 static IRExpr* widenUto32 ( IRExpr* e )
    871 {
    872    switch (typeOfIRExpr(irsb->tyenv,e)) {
    873       case Ity_I32: return e;
    874       case Ity_I16: return unop(Iop_16Uto32,e);
    875       case Ity_I8:  return unop(Iop_8Uto32,e);
    876       default: vpanic("widenUto32");
    877    }
    878 }
    879 
    880 /* S-widen 8/16/32 bit int expr to 32. */
    881 static IRExpr* widenSto32 ( IRExpr* e )
    882 {
    883    switch (typeOfIRExpr(irsb->tyenv,e)) {
    884       case Ity_I32: return e;
    885       case Ity_I16: return unop(Iop_16Sto32,e);
    886       case Ity_I8:  return unop(Iop_8Sto32,e);
    887       default: vpanic("widenSto32");
    888    }
    889 }
    890 
    891 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    892    of these combinations make sense. */
    893 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    894 {
    895    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    896    if (src_ty == dst_ty)
    897       return e;
    898    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    899       return unop(Iop_32to16, e);
    900    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    901       return unop(Iop_32to8, e);
    902 
    903    vex_printf("\nsrc, dst tys are: ");
    904    ppIRType(src_ty);
    905    vex_printf(", ");
    906    ppIRType(dst_ty);
    907    vex_printf("\n");
    908    vpanic("narrowTo(x86)");
    909 }
    910 
    911 
    912 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    913    auto-sized up to the real op. */
    914 
    915 static
    916 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    917 {
    918    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    919 
    920    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    921 
    922    switch (op8) {
    923       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    924       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    925       default:       ppIROp(op8);
    926                      vpanic("setFlags_DEP1_DEP2(x86)");
    927    }
    928    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    929    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    930    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    931    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    932       elimination of previous stores to this field work better. */
    933    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    934 }
    935 
    936 
    937 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    938 
    939 static
    940 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    941 {
    942    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    943 
    944    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    945 
    946    switch (op8) {
    947       case Iop_Or8:
    948       case Iop_And8:
    949       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    950       default:       ppIROp(op8);
    951                      vpanic("setFlags_DEP1(x86)");
    952    }
    953    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    954    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    955    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    956    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    957       elimination of previous stores to this field work better. */
    958    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    959 }
    960 
    961 
    962 /* For shift operations, we put in the result and the undershifted
    963    result.  Except if the shift amount is zero, the thunk is left
    964    unchanged. */
    965 
    966 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    967                                        IRTemp  res,
    968                                        IRTemp  resUS,
    969                                        IRType  ty,
    970                                        IRTemp  guard )
    971 {
    972    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    973 
    974    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    975    vassert(guard);
    976 
    977    /* Both kinds of right shifts are handled by the same thunk
    978       operation. */
    979    switch (op32) {
    980       case Iop_Shr32:
    981       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    982       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    983       default:        ppIROp(op32);
    984                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    985    }
    986 
    987    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    988    stmt( IRStmt_Put( OFFB_CC_OP,
    989                      IRExpr_Mux0X( mkexpr(guard),
    990                                    IRExpr_Get(OFFB_CC_OP,Ity_I32),
    991                                    mkU32(ccOp))) );
    992    stmt( IRStmt_Put( OFFB_CC_DEP1,
    993                      IRExpr_Mux0X( mkexpr(guard),
    994                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
    995                                    widenUto32(mkexpr(res)))) );
    996    stmt( IRStmt_Put( OFFB_CC_DEP2,
    997                      IRExpr_Mux0X( mkexpr(guard),
    998                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
    999                                    widenUto32(mkexpr(resUS)))) );
   1000    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1001       elimination of previous stores to this field work better. */
   1002    stmt( IRStmt_Put( OFFB_CC_NDEP,
   1003                      IRExpr_Mux0X( mkexpr(guard),
   1004                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
   1005 				   mkU32(0) )));
   1006 }
   1007 
   1008 
   1009 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1010    the former value of the carry flag, which unfortunately we have to
   1011    compute. */
   1012 
   1013 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1014 {
   1015    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1016 
   1017    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1018    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1019 
   1020    /* This has to come first, because calculating the C flag
   1021       may require reading all four thunk fields. */
   1022    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1023    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1024    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1025    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1026 }
   1027 
   1028 
   1029 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1030    two arguments. */
   1031 
   1032 static
   1033 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1034 {
   1035    switch (ty) {
   1036       case Ity_I8:
   1037          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1038          break;
   1039       case Ity_I16:
   1040          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1041          break;
   1042       case Ity_I32:
   1043          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1044          break;
   1045       default:
   1046          vpanic("setFlags_MUL(x86)");
   1047    }
   1048    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1049    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1050    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1051       elimination of previous stores to this field work better. */
   1052    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1053 }
   1054 
   1055 
   1056 /* -------------- Condition codes. -------------- */
   1057 
   1058 /* Condition codes, using the Intel encoding.  */
   1059 
   1060 static HChar* name_X86Condcode ( X86Condcode cond )
   1061 {
   1062    switch (cond) {
   1063       case X86CondO:      return "o";
   1064       case X86CondNO:     return "no";
   1065       case X86CondB:      return "b";
   1066       case X86CondNB:     return "nb";
   1067       case X86CondZ:      return "z";
   1068       case X86CondNZ:     return "nz";
   1069       case X86CondBE:     return "be";
   1070       case X86CondNBE:    return "nbe";
   1071       case X86CondS:      return "s";
   1072       case X86CondNS:     return "ns";
   1073       case X86CondP:      return "p";
   1074       case X86CondNP:     return "np";
   1075       case X86CondL:      return "l";
   1076       case X86CondNL:     return "nl";
   1077       case X86CondLE:     return "le";
   1078       case X86CondNLE:    return "nle";
   1079       case X86CondAlways: return "ALWAYS";
   1080       default: vpanic("name_X86Condcode");
   1081    }
   1082 }
   1083 
   1084 static
   1085 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1086                                       Bool*        needInvert )
   1087 {
   1088    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1089    if (cond & 1) {
   1090       *needInvert = True;
   1091       return cond-1;
   1092    } else {
   1093       *needInvert = False;
   1094       return cond;
   1095    }
   1096 }
   1097 
   1098 
   1099 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1100 
   1101 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1102    appropriately.
   1103 
   1104    Optionally, generate a store for the 'tres' value.  This can either
   1105    be a normal store, or it can be a cas-with-possible-failure style
   1106    store:
   1107 
   1108    if taddr is IRTemp_INVALID, then no store is generated.
   1109 
   1110    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1111    the address) is generated:
   1112 
   1113      if texpVal is IRTemp_INVALID then a normal store is
   1114      generated, and restart_point must be zero (it is irrelevant).
   1115 
   1116      if texpVal is not IRTemp_INVALID then a cas-style store is
   1117      generated.  texpVal is the expected value, restart_point
   1118      is the restart point if the store fails, and texpVal must
   1119      have the same type as tres.
   1120 */
   1121 static void helper_ADC ( Int sz,
   1122                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1123                          /* info about optional store: */
   1124                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1125 {
   1126    UInt    thunkOp;
   1127    IRType  ty    = szToITy(sz);
   1128    IRTemp  oldc  = newTemp(Ity_I32);
   1129    IRTemp  oldcn = newTemp(ty);
   1130    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1131    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1132 
   1133    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1134    vassert(sz == 1 || sz == 2 || sz == 4);
   1135    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1136                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1137 
   1138    /* oldc = old carry flag, 0 or 1 */
   1139    assign( oldc,  binop(Iop_And32,
   1140                         mk_x86g_calculate_eflags_c(),
   1141                         mkU32(1)) );
   1142 
   1143    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1144 
   1145    assign( tres, binop(plus,
   1146                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1147                        mkexpr(oldcn)) );
   1148 
   1149    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1150       start of this function. */
   1151    if (taddr != IRTemp_INVALID) {
   1152       if (texpVal == IRTemp_INVALID) {
   1153          vassert(restart_point == 0);
   1154          storeLE( mkexpr(taddr), mkexpr(tres) );
   1155       } else {
   1156          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1157          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1158          casLE( mkexpr(taddr),
   1159                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1160       }
   1161    }
   1162 
   1163    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1164    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1165    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1166                                                          mkexpr(oldcn)) )) );
   1167    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1168 }
   1169 
   1170 
   1171 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1172    appropriately.  As with helper_ADC, possibly generate a store of
   1173    the result -- see comments on helper_ADC for details.
   1174 */
   1175 static void helper_SBB ( Int sz,
   1176                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1177                          /* info about optional store: */
   1178                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1179 {
   1180    UInt    thunkOp;
   1181    IRType  ty    = szToITy(sz);
   1182    IRTemp  oldc  = newTemp(Ity_I32);
   1183    IRTemp  oldcn = newTemp(ty);
   1184    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1185    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1186 
   1187    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1188    vassert(sz == 1 || sz == 2 || sz == 4);
   1189    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1190                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1191 
   1192    /* oldc = old carry flag, 0 or 1 */
   1193    assign( oldc, binop(Iop_And32,
   1194                        mk_x86g_calculate_eflags_c(),
   1195                        mkU32(1)) );
   1196 
   1197    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1198 
   1199    assign( tres, binop(minus,
   1200                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1201                        mkexpr(oldcn)) );
   1202 
   1203    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1204       start of this function. */
   1205    if (taddr != IRTemp_INVALID) {
   1206       if (texpVal == IRTemp_INVALID) {
   1207          vassert(restart_point == 0);
   1208          storeLE( mkexpr(taddr), mkexpr(tres) );
   1209       } else {
   1210          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1211          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1212          casLE( mkexpr(taddr),
   1213                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1214       }
   1215    }
   1216 
   1217    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1218    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1219    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1220                                                          mkexpr(oldcn)) )) );
   1221    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1222 }
   1223 
   1224 
   1225 /* -------------- Helpers for disassembly printing. -------------- */
   1226 
   1227 static HChar* nameGrp1 ( Int opc_aux )
   1228 {
   1229    static HChar* grp1_names[8]
   1230      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1231    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1232    return grp1_names[opc_aux];
   1233 }
   1234 
   1235 static HChar* nameGrp2 ( Int opc_aux )
   1236 {
   1237    static HChar* grp2_names[8]
   1238      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1239    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1240    return grp2_names[opc_aux];
   1241 }
   1242 
   1243 static HChar* nameGrp4 ( Int opc_aux )
   1244 {
   1245    static HChar* grp4_names[8]
   1246      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1247    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1248    return grp4_names[opc_aux];
   1249 }
   1250 
   1251 static HChar* nameGrp5 ( Int opc_aux )
   1252 {
   1253    static HChar* grp5_names[8]
   1254      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1255    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1256    return grp5_names[opc_aux];
   1257 }
   1258 
   1259 static HChar* nameGrp8 ( Int opc_aux )
   1260 {
   1261    static HChar* grp8_names[8]
   1262      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1263    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1264    return grp8_names[opc_aux];
   1265 }
   1266 
   1267 static HChar* nameIReg ( Int size, Int reg )
   1268 {
   1269    static HChar* ireg32_names[8]
   1270      = { "%eax", "%ecx", "%edx", "%ebx",
   1271          "%esp", "%ebp", "%esi", "%edi" };
   1272    static HChar* ireg16_names[8]
   1273      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1274    static HChar* ireg8_names[8]
   1275      = { "%al", "%cl", "%dl", "%bl",
   1276          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1277    if (reg < 0 || reg > 7) goto bad;
   1278    switch (size) {
   1279       case 4: return ireg32_names[reg];
   1280       case 2: return ireg16_names[reg];
   1281       case 1: return ireg8_names[reg];
   1282    }
   1283   bad:
   1284    vpanic("nameIReg(X86)");
   1285    return NULL; /*notreached*/
   1286 }
   1287 
   1288 static HChar* nameSReg ( UInt sreg )
   1289 {
   1290    switch (sreg) {
   1291       case R_ES: return "%es";
   1292       case R_CS: return "%cs";
   1293       case R_SS: return "%ss";
   1294       case R_DS: return "%ds";
   1295       case R_FS: return "%fs";
   1296       case R_GS: return "%gs";
   1297       default: vpanic("nameSReg(x86)");
   1298    }
   1299 }
   1300 
   1301 static HChar* nameMMXReg ( Int mmxreg )
   1302 {
   1303    static HChar* mmx_names[8]
   1304      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1305    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1306    return mmx_names[mmxreg];
   1307 }
   1308 
   1309 static HChar* nameXMMReg ( Int xmmreg )
   1310 {
   1311    static HChar* xmm_names[8]
   1312      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1313          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1314    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1315    return xmm_names[xmmreg];
   1316 }
   1317 
   1318 static HChar* nameMMXGran ( Int gran )
   1319 {
   1320    switch (gran) {
   1321       case 0: return "b";
   1322       case 1: return "w";
   1323       case 2: return "d";
   1324       case 3: return "q";
   1325       default: vpanic("nameMMXGran(x86,guest)");
   1326    }
   1327 }
   1328 
   1329 static HChar nameISize ( Int size )
   1330 {
   1331    switch (size) {
   1332       case 4: return 'l';
   1333       case 2: return 'w';
   1334       case 1: return 'b';
   1335       default: vpanic("nameISize(x86)");
   1336    }
   1337 }
   1338 
   1339 
   1340 /*------------------------------------------------------------*/
   1341 /*--- JMP helpers                                          ---*/
   1342 /*------------------------------------------------------------*/
   1343 
   1344 static void jmp_lit( /*MOD*/DisResult* dres,
   1345                      IRJumpKind kind, Addr32 d32 )
   1346 {
   1347    vassert(dres->whatNext    == Dis_Continue);
   1348    vassert(dres->len         == 0);
   1349    vassert(dres->continueAt  == 0);
   1350    vassert(dres->jk_StopHere == Ijk_INVALID);
   1351    dres->whatNext    = Dis_StopHere;
   1352    dres->jk_StopHere = kind;
   1353    stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
   1354 }
   1355 
   1356 static void jmp_treg( /*MOD*/DisResult* dres,
   1357                       IRJumpKind kind, IRTemp t )
   1358 {
   1359    vassert(dres->whatNext    == Dis_Continue);
   1360    vassert(dres->len         == 0);
   1361    vassert(dres->continueAt  == 0);
   1362    vassert(dres->jk_StopHere == Ijk_INVALID);
   1363    dres->whatNext    = Dis_StopHere;
   1364    dres->jk_StopHere = kind;
   1365    stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
   1366 }
   1367 
   1368 static
   1369 void jcc_01( /*MOD*/DisResult* dres,
   1370              X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1371 {
   1372    Bool        invert;
   1373    X86Condcode condPos;
   1374    vassert(dres->whatNext    == Dis_Continue);
   1375    vassert(dres->len         == 0);
   1376    vassert(dres->continueAt  == 0);
   1377    vassert(dres->jk_StopHere == Ijk_INVALID);
   1378    dres->whatNext    = Dis_StopHere;
   1379    dres->jk_StopHere = Ijk_Boring;
   1380    condPos = positiveIse_X86Condcode ( cond, &invert );
   1381    if (invert) {
   1382       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1383                          Ijk_Boring,
   1384                          IRConst_U32(d32_false),
   1385                          OFFB_EIP ) );
   1386       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
   1387    } else {
   1388       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1389                          Ijk_Boring,
   1390                          IRConst_U32(d32_true),
   1391                          OFFB_EIP ) );
   1392       stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
   1393    }
   1394 }
   1395 
   1396 
   1397 /*------------------------------------------------------------*/
   1398 /*--- Disassembling addressing modes                       ---*/
   1399 /*------------------------------------------------------------*/
   1400 
   1401 static
   1402 HChar* sorbTxt ( UChar sorb )
   1403 {
   1404    switch (sorb) {
   1405       case 0:    return ""; /* no override */
   1406       case 0x3E: return "%ds";
   1407       case 0x26: return "%es:";
   1408       case 0x64: return "%fs:";
   1409       case 0x65: return "%gs:";
   1410       default: vpanic("sorbTxt(x86,guest)");
   1411    }
   1412 }
   1413 
   1414 
   1415 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1416    linear address by adding any required segment override as indicated
   1417    by sorb. */
   1418 static
   1419 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1420 {
   1421    Int    sreg;
   1422    IRType hWordTy;
   1423    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1424 
   1425    if (sorb == 0)
   1426       /* the common case - no override */
   1427       return virtual;
   1428 
   1429    switch (sorb) {
   1430       case 0x3E: sreg = R_DS; break;
   1431       case 0x26: sreg = R_ES; break;
   1432       case 0x64: sreg = R_FS; break;
   1433       case 0x65: sreg = R_GS; break;
   1434       default: vpanic("handleSegOverride(x86,guest)");
   1435    }
   1436 
   1437    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1438 
   1439    seg_selector = newTemp(Ity_I32);
   1440    ldt_ptr      = newTemp(hWordTy);
   1441    gdt_ptr      = newTemp(hWordTy);
   1442    r64          = newTemp(Ity_I64);
   1443 
   1444    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1445    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1446    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1447 
   1448    /*
   1449    Call this to do the translation and limit checks:
   1450    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1451                                  UInt seg_selector, UInt virtual_addr )
   1452    */
   1453    assign(
   1454       r64,
   1455       mkIRExprCCall(
   1456          Ity_I64,
   1457          0/*regparms*/,
   1458          "x86g_use_seg_selector",
   1459          &x86g_use_seg_selector,
   1460          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1461                         mkexpr(seg_selector), virtual)
   1462       )
   1463    );
   1464 
   1465    /* If the high 32 of the result are non-zero, there was a
   1466       failure in address translation.  In which case, make a
   1467       quick exit.
   1468    */
   1469    stmt(
   1470       IRStmt_Exit(
   1471          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1472          Ijk_MapFail,
   1473          IRConst_U32( guest_EIP_curr_instr ),
   1474          OFFB_EIP
   1475       )
   1476    );
   1477 
   1478    /* otherwise, here's the translated result. */
   1479    return unop(Iop_64to32, mkexpr(r64));
   1480 }
   1481 
   1482 
   1483 /* Generate IR to calculate an address indicated by a ModRM and
   1484    following SIB bytes.  The expression, and the number of bytes in
   1485    the address mode, are returned.  Note that this fn should not be
   1486    called if the R/M part of the address denotes a register instead of
   1487    memory.  If print_codegen is true, text of the addressing mode is
   1488    placed in buf.
   1489 
   1490    The computed address is stored in a new tempreg, and the
   1491    identity of the tempreg is returned.  */
   1492 
   1493 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1494 {
   1495    IRTemp tmp = newTemp(Ity_I32);
   1496    assign( tmp, addr32 );
   1497    return tmp;
   1498 }
   1499 
   1500 static
   1501 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1502 {
   1503    UChar mod_reg_rm = getIByte(delta);
   1504    delta++;
   1505 
   1506    buf[0] = (UChar)0;
   1507 
   1508    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1509       jump table seems a bit excessive.
   1510    */
   1511    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1512    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1513                                             /* is now XX0XXYYY */
   1514    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1515    switch (mod_reg_rm) {
   1516 
   1517       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1518          --> GET %reg, t
   1519       */
   1520       case 0x00: case 0x01: case 0x02: case 0x03:
   1521       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1522          { UChar rm = mod_reg_rm;
   1523            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1524            *len = 1;
   1525            return disAMode_copy2tmp(
   1526                   handleSegOverride(sorb, getIReg(4,rm)));
   1527          }
   1528 
   1529       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1530          --> GET %reg, t ; ADDL d8, t
   1531       */
   1532       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1533       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1534          { UChar rm = toUChar(mod_reg_rm & 7);
   1535            UInt  d  = getSDisp8(delta);
   1536            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1537            *len = 2;
   1538            return disAMode_copy2tmp(
   1539                   handleSegOverride(sorb,
   1540                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1541          }
   1542 
   1543       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1544          --> GET %reg, t ; ADDL d8, t
   1545       */
   1546       case 0x10: case 0x11: case 0x12: case 0x13:
   1547       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1548          { UChar rm = toUChar(mod_reg_rm & 7);
   1549            UInt  d  = getUDisp32(delta);
   1550            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1551            *len = 5;
   1552            return disAMode_copy2tmp(
   1553                   handleSegOverride(sorb,
   1554                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1555          }
   1556 
   1557       /* a register, %eax .. %edi.  This shouldn't happen. */
   1558       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1559       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1560          vpanic("disAMode(x86): not an addr!");
   1561 
   1562       /* a 32-bit literal address
   1563          --> MOV d32, tmp
   1564       */
   1565       case 0x05:
   1566          { UInt d = getUDisp32(delta);
   1567            *len = 5;
   1568            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1569            return disAMode_copy2tmp(
   1570                      handleSegOverride(sorb, mkU32(d)));
   1571          }
   1572 
   1573       case 0x04: {
   1574          /* SIB, with no displacement.  Special cases:
   1575             -- %esp cannot act as an index value.
   1576                If index_r indicates %esp, zero is used for the index.
   1577             -- when mod is zero and base indicates EBP, base is instead
   1578                a 32-bit literal.
   1579             It's all madness, I tell you.  Extract %index, %base and
   1580             scale from the SIB byte.  The value denoted is then:
   1581                | %index == %ESP && %base == %EBP
   1582                = d32 following SIB byte
   1583                | %index == %ESP && %base != %EBP
   1584                = %base
   1585                | %index != %ESP && %base == %EBP
   1586                = d32 following SIB byte + (%index << scale)
   1587                | %index != %ESP && %base != %ESP
   1588                = %base + (%index << scale)
   1589 
   1590             What happens to the souls of CPU architects who dream up such
   1591             horrendous schemes, do you suppose?
   1592          */
   1593          UChar sib     = getIByte(delta);
   1594          UChar scale   = toUChar((sib >> 6) & 3);
   1595          UChar index_r = toUChar((sib >> 3) & 7);
   1596          UChar base_r  = toUChar(sib & 7);
   1597          delta++;
   1598 
   1599          if (index_r != R_ESP && base_r != R_EBP) {
   1600             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1601                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1602             *len = 2;
   1603             return
   1604                disAMode_copy2tmp(
   1605                handleSegOverride(sorb,
   1606                   binop(Iop_Add32,
   1607                         getIReg(4,base_r),
   1608                         binop(Iop_Shl32, getIReg(4,index_r),
   1609                               mkU8(scale)))));
   1610          }
   1611 
   1612          if (index_r != R_ESP && base_r == R_EBP) {
   1613             UInt d = getUDisp32(delta);
   1614             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1615                       nameIReg(4,index_r), 1<<scale);
   1616             *len = 6;
   1617             return
   1618                disAMode_copy2tmp(
   1619                handleSegOverride(sorb,
   1620                   binop(Iop_Add32,
   1621                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1622                         mkU32(d))));
   1623          }
   1624 
   1625          if (index_r == R_ESP && base_r != R_EBP) {
   1626             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1627             *len = 2;
   1628             return disAMode_copy2tmp(
   1629                    handleSegOverride(sorb, getIReg(4,base_r)));
   1630          }
   1631 
   1632          if (index_r == R_ESP && base_r == R_EBP) {
   1633             UInt d = getUDisp32(delta);
   1634             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1635             *len = 6;
   1636             return disAMode_copy2tmp(
   1637                    handleSegOverride(sorb, mkU32(d)));
   1638          }
   1639          /*NOTREACHED*/
   1640          vassert(0);
   1641       }
   1642 
   1643       /* SIB, with 8-bit displacement.  Special cases:
   1644          -- %esp cannot act as an index value.
   1645             If index_r indicates %esp, zero is used for the index.
   1646          Denoted value is:
   1647             | %index == %ESP
   1648             = d8 + %base
   1649             | %index != %ESP
   1650             = d8 + %base + (%index << scale)
   1651       */
   1652       case 0x0C: {
   1653          UChar sib     = getIByte(delta);
   1654          UChar scale   = toUChar((sib >> 6) & 3);
   1655          UChar index_r = toUChar((sib >> 3) & 7);
   1656          UChar base_r  = toUChar(sib & 7);
   1657          UInt  d       = getSDisp8(delta+1);
   1658 
   1659          if (index_r == R_ESP) {
   1660             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1661                                    (Int)d, nameIReg(4,base_r));
   1662             *len = 3;
   1663             return disAMode_copy2tmp(
   1664                    handleSegOverride(sorb,
   1665                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1666          } else {
   1667             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1668                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1669             *len = 3;
   1670             return
   1671                 disAMode_copy2tmp(
   1672                 handleSegOverride(sorb,
   1673                   binop(Iop_Add32,
   1674                         binop(Iop_Add32,
   1675                               getIReg(4,base_r),
   1676                               binop(Iop_Shl32,
   1677                                     getIReg(4,index_r), mkU8(scale))),
   1678                         mkU32(d))));
   1679          }
   1680 	 /*NOTREACHED*/
   1681          vassert(0);
   1682       }
   1683 
   1684       /* SIB, with 32-bit displacement.  Special cases:
   1685          -- %esp cannot act as an index value.
   1686             If index_r indicates %esp, zero is used for the index.
   1687          Denoted value is:
   1688             | %index == %ESP
   1689             = d32 + %base
   1690             | %index != %ESP
   1691             = d32 + %base + (%index << scale)
   1692       */
   1693       case 0x14: {
   1694          UChar sib     = getIByte(delta);
   1695          UChar scale   = toUChar((sib >> 6) & 3);
   1696          UChar index_r = toUChar((sib >> 3) & 7);
   1697          UChar base_r  = toUChar(sib & 7);
   1698          UInt d        = getUDisp32(delta+1);
   1699 
   1700          if (index_r == R_ESP) {
   1701             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1702                                    (Int)d, nameIReg(4,base_r));
   1703             *len = 6;
   1704             return disAMode_copy2tmp(
   1705                    handleSegOverride(sorb,
   1706                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1707          } else {
   1708             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1709                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1710             *len = 6;
   1711             return
   1712                 disAMode_copy2tmp(
   1713                 handleSegOverride(sorb,
   1714                   binop(Iop_Add32,
   1715                         binop(Iop_Add32,
   1716                               getIReg(4,base_r),
   1717                               binop(Iop_Shl32,
   1718                                     getIReg(4,index_r), mkU8(scale))),
   1719                         mkU32(d))));
   1720          }
   1721 	 /*NOTREACHED*/
   1722          vassert(0);
   1723       }
   1724 
   1725       default:
   1726          vpanic("disAMode(x86)");
   1727          return 0; /*notreached*/
   1728    }
   1729 }
   1730 
   1731 
   1732 /* Figure out the number of (insn-stream) bytes constituting the amode
   1733    beginning at delta.  Is useful for getting hold of literals beyond
   1734    the end of the amode before it has been disassembled.  */
   1735 
   1736 static UInt lengthAMode ( Int delta )
   1737 {
   1738    UChar mod_reg_rm = getIByte(delta); delta++;
   1739 
   1740    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1741       jump table seems a bit excessive.
   1742    */
   1743    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1744    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1745                                      /* is now XX0XXYYY */
   1746    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
   1747    switch (mod_reg_rm) {
   1748 
   1749       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1750       case 0x00: case 0x01: case 0x02: case 0x03:
   1751       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1752          return 1;
   1753 
   1754       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1755       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1756       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1757          return 2;
   1758 
   1759       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1760       case 0x10: case 0x11: case 0x12: case 0x13:
   1761       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1762          return 5;
   1763 
   1764       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1765       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1766       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1767          return 1;
   1768 
   1769       /* a 32-bit literal address. */
   1770       case 0x05: return 5;
   1771 
   1772       /* SIB, no displacement.  */
   1773       case 0x04: {
   1774          UChar sib    = getIByte(delta);
   1775          UChar base_r = toUChar(sib & 7);
   1776          if (base_r == R_EBP) return 6; else return 2;
   1777       }
   1778       /* SIB, with 8-bit displacement.  */
   1779       case 0x0C: return 3;
   1780 
   1781       /* SIB, with 32-bit displacement.  */
   1782       case 0x14: return 6;
   1783 
   1784       default:
   1785          vpanic("lengthAMode");
   1786          return 0; /*notreached*/
   1787    }
   1788 }
   1789 
   1790 /*------------------------------------------------------------*/
   1791 /*--- Disassembling common idioms                          ---*/
   1792 /*------------------------------------------------------------*/
   1793 
   1794 /* Handle binary integer instructions of the form
   1795       op E, G  meaning
   1796       op reg-or-mem, reg
   1797    Is passed the a ptr to the modRM byte, the actual operation, and the
   1798    data size.  Returns the address advanced completely over this
   1799    instruction.
   1800 
   1801    E(src) is reg-or-mem
   1802    G(dst) is reg.
   1803 
   1804    If E is reg, -->    GET %G,  tmp
   1805                        OP %E,   tmp
   1806                        PUT tmp, %G
   1807 
   1808    If E is mem and OP is not reversible,
   1809                 -->    (getAddr E) -> tmpa
   1810                        LD (tmpa), tmpa
   1811                        GET %G, tmp2
   1812                        OP tmpa, tmp2
   1813                        PUT tmp2, %G
   1814 
   1815    If E is mem and OP is reversible
   1816                 -->    (getAddr E) -> tmpa
   1817                        LD (tmpa), tmpa
   1818                        OP %G, tmpa
   1819                        PUT tmpa, %G
   1820 */
   1821 static
   1822 UInt dis_op2_E_G ( UChar       sorb,
   1823                    Bool        addSubCarry,
   1824                    IROp        op8,
   1825                    Bool        keep,
   1826                    Int         size,
   1827                    Int         delta0,
   1828                    HChar*      t_x86opc )
   1829 {
   1830    HChar   dis_buf[50];
   1831    Int     len;
   1832    IRType  ty   = szToITy(size);
   1833    IRTemp  dst1 = newTemp(ty);
   1834    IRTemp  src  = newTemp(ty);
   1835    IRTemp  dst0 = newTemp(ty);
   1836    UChar   rm   = getUChar(delta0);
   1837    IRTemp  addr = IRTemp_INVALID;
   1838 
   1839    /* addSubCarry == True indicates the intended operation is
   1840       add-with-carry or subtract-with-borrow. */
   1841    if (addSubCarry) {
   1842       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1843       vassert(keep);
   1844    }
   1845 
   1846    if (epartIsReg(rm)) {
   1847       /* Specially handle XOR reg,reg, because that doesn't really
   1848          depend on reg, and doing the obvious thing potentially
   1849          generates a spurious value check failure due to the bogus
   1850          dependency.  Ditto SBB reg,reg. */
   1851       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1852           && gregOfRM(rm) == eregOfRM(rm)) {
   1853          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1854       }
   1855       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1856       assign( src,  getIReg(size,eregOfRM(rm)) );
   1857 
   1858       if (addSubCarry && op8 == Iop_Add8) {
   1859          helper_ADC( size, dst1, dst0, src,
   1860                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1861          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1862       } else
   1863       if (addSubCarry && op8 == Iop_Sub8) {
   1864          helper_SBB( size, dst1, dst0, src,
   1865                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1866          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1867       } else {
   1868          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1869          if (isAddSub(op8))
   1870             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1871          else
   1872             setFlags_DEP1(op8, dst1, ty);
   1873          if (keep)
   1874             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1875       }
   1876 
   1877       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1878                           nameIReg(size,eregOfRM(rm)),
   1879                           nameIReg(size,gregOfRM(rm)));
   1880       return 1+delta0;
   1881    } else {
   1882       /* E refers to memory */
   1883       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1884       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1885       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1886 
   1887       if (addSubCarry && op8 == Iop_Add8) {
   1888          helper_ADC( size, dst1, dst0, src,
   1889                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1890          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1891       } else
   1892       if (addSubCarry && op8 == Iop_Sub8) {
   1893          helper_SBB( size, dst1, dst0, src,
   1894                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1895          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1896       } else {
   1897          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1898          if (isAddSub(op8))
   1899             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1900          else
   1901             setFlags_DEP1(op8, dst1, ty);
   1902          if (keep)
   1903             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1904       }
   1905 
   1906       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1907                           dis_buf,nameIReg(size,gregOfRM(rm)));
   1908       return len+delta0;
   1909    }
   1910 }
   1911 
   1912 
   1913 
   1914 /* Handle binary integer instructions of the form
   1915       op G, E  meaning
   1916       op reg, reg-or-mem
   1917    Is passed the a ptr to the modRM byte, the actual operation, and the
   1918    data size.  Returns the address advanced completely over this
   1919    instruction.
   1920 
   1921    G(src) is reg.
   1922    E(dst) is reg-or-mem
   1923 
   1924    If E is reg, -->    GET %E,  tmp
   1925                        OP %G,   tmp
   1926                        PUT tmp, %E
   1927 
   1928    If E is mem, -->    (getAddr E) -> tmpa
   1929                        LD (tmpa), tmpv
   1930                        OP %G, tmpv
   1931                        ST tmpv, (tmpa)
   1932 */
   1933 static
   1934 UInt dis_op2_G_E ( UChar       sorb,
   1935                    Bool        locked,
   1936                    Bool        addSubCarry,
   1937                    IROp        op8,
   1938                    Bool        keep,
   1939                    Int         size,
   1940                    Int         delta0,
   1941                    HChar*      t_x86opc )
   1942 {
   1943    HChar   dis_buf[50];
   1944    Int     len;
   1945    IRType  ty   = szToITy(size);
   1946    IRTemp  dst1 = newTemp(ty);
   1947    IRTemp  src  = newTemp(ty);
   1948    IRTemp  dst0 = newTemp(ty);
   1949    UChar   rm   = getIByte(delta0);
   1950    IRTemp  addr = IRTemp_INVALID;
   1951 
   1952    /* addSubCarry == True indicates the intended operation is
   1953       add-with-carry or subtract-with-borrow. */
   1954    if (addSubCarry) {
   1955       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1956       vassert(keep);
   1957    }
   1958 
   1959    if (epartIsReg(rm)) {
   1960       /* Specially handle XOR reg,reg, because that doesn't really
   1961          depend on reg, and doing the obvious thing potentially
   1962          generates a spurious value check failure due to the bogus
   1963          dependency.  Ditto SBB reg,reg.*/
   1964       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1965           && gregOfRM(rm) == eregOfRM(rm)) {
   1966          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1967       }
   1968       assign(dst0, getIReg(size,eregOfRM(rm)));
   1969       assign(src,  getIReg(size,gregOfRM(rm)));
   1970 
   1971       if (addSubCarry && op8 == Iop_Add8) {
   1972          helper_ADC( size, dst1, dst0, src,
   1973                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1974          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1975       } else
   1976       if (addSubCarry && op8 == Iop_Sub8) {
   1977          helper_SBB( size, dst1, dst0, src,
   1978                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1979          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1980       } else {
   1981          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   1982          if (isAddSub(op8))
   1983             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1984          else
   1985             setFlags_DEP1(op8, dst1, ty);
   1986          if (keep)
   1987             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1988       }
   1989 
   1990       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1991                           nameIReg(size,gregOfRM(rm)),
   1992                           nameIReg(size,eregOfRM(rm)));
   1993       return 1+delta0;
   1994    }
   1995 
   1996    /* E refers to memory */
   1997    {
   1998       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1999       assign(dst0, loadLE(ty,mkexpr(addr)));
   2000       assign(src,  getIReg(size,gregOfRM(rm)));
   2001 
   2002       if (addSubCarry && op8 == Iop_Add8) {
   2003          if (locked) {
   2004             /* cas-style store */
   2005             helper_ADC( size, dst1, dst0, src,
   2006                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2007          } else {
   2008             /* normal store */
   2009             helper_ADC( size, dst1, dst0, src,
   2010                         /*store*/addr, IRTemp_INVALID, 0 );
   2011          }
   2012       } else
   2013       if (addSubCarry && op8 == Iop_Sub8) {
   2014          if (locked) {
   2015             /* cas-style store */
   2016             helper_SBB( size, dst1, dst0, src,
   2017                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2018          } else {
   2019             /* normal store */
   2020             helper_SBB( size, dst1, dst0, src,
   2021                         /*store*/addr, IRTemp_INVALID, 0 );
   2022          }
   2023       } else {
   2024          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2025          if (keep) {
   2026             if (locked) {
   2027                if (0) vex_printf("locked case\n" );
   2028                casLE( mkexpr(addr),
   2029                       mkexpr(dst0)/*expval*/,
   2030                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2031             } else {
   2032                if (0) vex_printf("nonlocked case\n");
   2033                storeLE(mkexpr(addr), mkexpr(dst1));
   2034             }
   2035          }
   2036          if (isAddSub(op8))
   2037             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2038          else
   2039             setFlags_DEP1(op8, dst1, ty);
   2040       }
   2041 
   2042       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2043                           nameIReg(size,gregOfRM(rm)), dis_buf);
   2044       return len+delta0;
   2045    }
   2046 }
   2047 
   2048 
   2049 /* Handle move instructions of the form
   2050       mov E, G  meaning
   2051       mov reg-or-mem, reg
   2052    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2053    the address advanced completely over this instruction.
   2054 
   2055    E(src) is reg-or-mem
   2056    G(dst) is reg.
   2057 
   2058    If E is reg, -->    GET %E,  tmpv
   2059                        PUT tmpv, %G
   2060 
   2061    If E is mem  -->    (getAddr E) -> tmpa
   2062                        LD (tmpa), tmpb
   2063                        PUT tmpb, %G
   2064 */
   2065 static
   2066 UInt dis_mov_E_G ( UChar       sorb,
   2067                    Int         size,
   2068                    Int         delta0 )
   2069 {
   2070    Int len;
   2071    UChar rm = getIByte(delta0);
   2072    HChar dis_buf[50];
   2073 
   2074    if (epartIsReg(rm)) {
   2075       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2076       DIP("mov%c %s,%s\n", nameISize(size),
   2077                            nameIReg(size,eregOfRM(rm)),
   2078                            nameIReg(size,gregOfRM(rm)));
   2079       return 1+delta0;
   2080    }
   2081 
   2082    /* E refers to memory */
   2083    {
   2084       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2085       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2086       DIP("mov%c %s,%s\n", nameISize(size),
   2087                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2088       return delta0+len;
   2089    }
   2090 }
   2091 
   2092 
   2093 /* Handle move instructions of the form
   2094       mov G, E  meaning
   2095       mov reg, reg-or-mem
   2096    Is passed the a ptr to the modRM byte, and the data size.  Returns
   2097    the address advanced completely over this instruction.
   2098 
   2099    G(src) is reg.
   2100    E(dst) is reg-or-mem
   2101 
   2102    If E is reg, -->    GET %G,  tmp
   2103                        PUT tmp, %E
   2104 
   2105    If E is mem, -->    (getAddr E) -> tmpa
   2106                        GET %G, tmpv
   2107                        ST tmpv, (tmpa)
   2108 */
   2109 static
   2110 UInt dis_mov_G_E ( UChar       sorb,
   2111                    Int         size,
   2112                    Int         delta0 )
   2113 {
   2114    Int len;
   2115    UChar rm = getIByte(delta0);
   2116    HChar dis_buf[50];
   2117 
   2118    if (epartIsReg(rm)) {
   2119       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2120       DIP("mov%c %s,%s\n", nameISize(size),
   2121                            nameIReg(size,gregOfRM(rm)),
   2122                            nameIReg(size,eregOfRM(rm)));
   2123       return 1+delta0;
   2124    }
   2125 
   2126    /* E refers to memory */
   2127    {
   2128       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2129       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2130       DIP("mov%c %s,%s\n", nameISize(size),
   2131                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2132       return len+delta0;
   2133    }
   2134 }
   2135 
   2136 
   2137 /* op $immediate, AL/AX/EAX. */
   2138 static
   2139 UInt dis_op_imm_A ( Int    size,
   2140                     Bool   carrying,
   2141                     IROp   op8,
   2142                     Bool   keep,
   2143                     Int    delta,
   2144                     HChar* t_x86opc )
   2145 {
   2146    IRType ty   = szToITy(size);
   2147    IRTemp dst0 = newTemp(ty);
   2148    IRTemp src  = newTemp(ty);
   2149    IRTemp dst1 = newTemp(ty);
   2150    UInt lit    = getUDisp(size,delta);
   2151    assign(dst0, getIReg(size,R_EAX));
   2152    assign(src,  mkU(ty,lit));
   2153 
   2154    if (isAddSub(op8) && !carrying) {
   2155       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2156       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2157    }
   2158    else
   2159    if (isLogic(op8)) {
   2160       vassert(!carrying);
   2161       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2162       setFlags_DEP1(op8, dst1, ty);
   2163    }
   2164    else
   2165    if (op8 == Iop_Add8 && carrying) {
   2166       helper_ADC( size, dst1, dst0, src,
   2167                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2168    }
   2169    else
   2170    if (op8 == Iop_Sub8 && carrying) {
   2171       helper_SBB( size, dst1, dst0, src,
   2172                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2173    }
   2174    else
   2175       vpanic("dis_op_imm_A(x86,guest)");
   2176 
   2177    if (keep)
   2178       putIReg(size, R_EAX, mkexpr(dst1));
   2179 
   2180    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2181                            lit, nameIReg(size,R_EAX));
   2182    return delta+size;
   2183 }
   2184 
   2185 
   2186 /* Sign- and Zero-extending moves. */
   2187 static
   2188 UInt dis_movx_E_G ( UChar      sorb,
   2189                     Int delta, Int szs, Int szd, Bool sign_extend )
   2190 {
   2191    UChar rm = getIByte(delta);
   2192    if (epartIsReg(rm)) {
   2193       if (szd == szs) {
   2194          // mutant case.  See #250799
   2195          putIReg(szd, gregOfRM(rm),
   2196                            getIReg(szs,eregOfRM(rm)));
   2197       } else {
   2198          // normal case
   2199          putIReg(szd, gregOfRM(rm),
   2200                       unop(mkWidenOp(szs,szd,sign_extend),
   2201                            getIReg(szs,eregOfRM(rm))));
   2202       }
   2203       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2204                                nameISize(szs), nameISize(szd),
   2205                                nameIReg(szs,eregOfRM(rm)),
   2206                                nameIReg(szd,gregOfRM(rm)));
   2207       return 1+delta;
   2208    }
   2209 
   2210    /* E refers to memory */
   2211    {
   2212       Int    len;
   2213       HChar  dis_buf[50];
   2214       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
   2215       if (szd == szs) {
   2216          // mutant case.  See #250799
   2217          putIReg(szd, gregOfRM(rm),
   2218                            loadLE(szToITy(szs),mkexpr(addr)));
   2219       } else {
   2220          // normal case
   2221          putIReg(szd, gregOfRM(rm),
   2222                       unop(mkWidenOp(szs,szd,sign_extend),
   2223                            loadLE(szToITy(szs),mkexpr(addr))));
   2224       }
   2225       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2226                                nameISize(szs), nameISize(szd),
   2227                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2228       return len+delta;
   2229    }
   2230 }
   2231 
   2232 
   2233 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2234    16 / 8 bit quantity in the given IRTemp.  */
   2235 static
   2236 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2237 {
   2238    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2239    IRTemp src64 = newTemp(Ity_I64);
   2240    IRTemp dst64 = newTemp(Ity_I64);
   2241    switch (sz) {
   2242       case 4:
   2243          assign( src64, binop(Iop_32HLto64,
   2244                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2245          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2246          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2247          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2248          break;
   2249       case 2: {
   2250          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2251          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2252          assign( src64, unop(widen3264,
   2253                              binop(Iop_16HLto32,
   2254                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2255          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2256          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2257          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2258          break;
   2259       }
   2260       case 1: {
   2261          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2262          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2263          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2264          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2265          assign( dst64,
   2266                  binop(op, mkexpr(src64),
   2267                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2268          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2269                            unop(Iop_64to32,mkexpr(dst64)))) );
   2270          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2271                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2272          break;
   2273       }
   2274       default: vpanic("codegen_div(x86)");
   2275    }
   2276 }
   2277 
   2278 
   2279 static
   2280 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2281                 Int delta, UChar modrm,
   2282                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2283 {
   2284    Int     len;
   2285    HChar   dis_buf[50];
   2286    IRType  ty   = szToITy(sz);
   2287    IRTemp  dst1 = newTemp(ty);
   2288    IRTemp  src  = newTemp(ty);
   2289    IRTemp  dst0 = newTemp(ty);
   2290    IRTemp  addr = IRTemp_INVALID;
   2291    IROp    op8  = Iop_INVALID;
   2292    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2293 
   2294    switch (gregOfRM(modrm)) {
   2295       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2296       case 2: break;  // ADC
   2297       case 3: break;  // SBB
   2298       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2299       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2300       /*NOTREACHED*/
   2301       default: vpanic("dis_Grp1: unhandled case");
   2302    }
   2303 
   2304    if (epartIsReg(modrm)) {
   2305       vassert(am_sz == 1);
   2306 
   2307       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2308       assign(src,  mkU(ty,d32 & mask));
   2309 
   2310       if (gregOfRM(modrm) == 2 /* ADC */) {
   2311          helper_ADC( sz, dst1, dst0, src,
   2312                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2313       } else
   2314       if (gregOfRM(modrm) == 3 /* SBB */) {
   2315          helper_SBB( sz, dst1, dst0, src,
   2316                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2317       } else {
   2318          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2319          if (isAddSub(op8))
   2320             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2321          else
   2322             setFlags_DEP1(op8, dst1, ty);
   2323       }
   2324 
   2325       if (gregOfRM(modrm) < 7)
   2326          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2327 
   2328       delta += (am_sz + d_sz);
   2329       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2330                               nameIReg(sz,eregOfRM(modrm)));
   2331    } else {
   2332       addr = disAMode ( &len, sorb, delta, dis_buf);
   2333 
   2334       assign(dst0, loadLE(ty,mkexpr(addr)));
   2335       assign(src, mkU(ty,d32 & mask));
   2336 
   2337       if (gregOfRM(modrm) == 2 /* ADC */) {
   2338          if (locked) {
   2339             /* cas-style store */
   2340             helper_ADC( sz, dst1, dst0, src,
   2341                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2342          } else {
   2343             /* normal store */
   2344             helper_ADC( sz, dst1, dst0, src,
   2345                         /*store*/addr, IRTemp_INVALID, 0 );
   2346          }
   2347       } else
   2348       if (gregOfRM(modrm) == 3 /* SBB */) {
   2349          if (locked) {
   2350             /* cas-style store */
   2351             helper_SBB( sz, dst1, dst0, src,
   2352                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2353          } else {
   2354             /* normal store */
   2355             helper_SBB( sz, dst1, dst0, src,
   2356                         /*store*/addr, IRTemp_INVALID, 0 );
   2357          }
   2358       } else {
   2359          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   2360          if (gregOfRM(modrm) < 7) {
   2361             if (locked) {
   2362                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2363                                     mkexpr(dst1)/*newVal*/,
   2364                                     guest_EIP_curr_instr );
   2365             } else {
   2366                storeLE(mkexpr(addr), mkexpr(dst1));
   2367             }
   2368          }
   2369          if (isAddSub(op8))
   2370             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2371          else
   2372             setFlags_DEP1(op8, dst1, ty);
   2373       }
   2374 
   2375       delta += (len+d_sz);
   2376       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2377                               d32, dis_buf);
   2378    }
   2379    return delta;
   2380 }
   2381 
   2382 
   2383 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   2384    expression. */
   2385 
   2386 static
   2387 UInt dis_Grp2 ( UChar sorb,
   2388                 Int delta, UChar modrm,
   2389                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   2390                 HChar* shift_expr_txt, Bool* decode_OK )
   2391 {
   2392    /* delta on entry points at the modrm byte. */
   2393    HChar  dis_buf[50];
   2394    Int    len;
   2395    Bool   isShift, isRotate, isRotateC;
   2396    IRType ty    = szToITy(sz);
   2397    IRTemp dst0  = newTemp(ty);
   2398    IRTemp dst1  = newTemp(ty);
   2399    IRTemp addr  = IRTemp_INVALID;
   2400 
   2401    *decode_OK = True;
   2402 
   2403    vassert(sz == 1 || sz == 2 || sz == 4);
   2404 
   2405    /* Put value to shift/rotate in dst0. */
   2406    if (epartIsReg(modrm)) {
   2407       assign(dst0, getIReg(sz, eregOfRM(modrm)));
   2408       delta += (am_sz + d_sz);
   2409    } else {
   2410       addr = disAMode ( &len, sorb, delta, dis_buf);
   2411       assign(dst0, loadLE(ty,mkexpr(addr)));
   2412       delta += len + d_sz;
   2413    }
   2414 
   2415    isShift = False;
   2416    switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   2417 
   2418    isRotate = False;
   2419    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
   2420 
   2421    isRotateC = False;
   2422    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
   2423 
   2424    if (!isShift && !isRotate && !isRotateC) {
   2425       /*NOTREACHED*/
   2426       vpanic("dis_Grp2(Reg): unhandled case(x86)");
   2427    }
   2428 
   2429    if (isRotateC) {
   2430       /* call a helper; these insns are so ridiculous they do not
   2431          deserve better */
   2432       Bool     left = toBool(gregOfRM(modrm) == 2);
   2433       IRTemp   r64  = newTemp(Ity_I64);
   2434       IRExpr** args
   2435          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
   2436                           widenUto32(shift_expr),   /* rotate amount */
   2437                           widenUto32(mk_x86g_calculate_eflags_all()),
   2438                           mkU32(sz) );
   2439       assign( r64, mkIRExprCCall(
   2440                       Ity_I64,
   2441                       0/*regparm*/,
   2442                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
   2443                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
   2444                       args
   2445                    )
   2446             );
   2447       /* new eflags in hi half r64; new value in lo half r64 */
   2448       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
   2449       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2450       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
   2451       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2452       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2453          elimination of previous stores to this field work better. */
   2454       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2455    }
   2456 
   2457    if (isShift) {
   2458 
   2459       IRTemp pre32     = newTemp(Ity_I32);
   2460       IRTemp res32     = newTemp(Ity_I32);
   2461       IRTemp res32ss   = newTemp(Ity_I32);
   2462       IRTemp shift_amt = newTemp(Ity_I8);
   2463       IROp   op32;
   2464 
   2465       switch (gregOfRM(modrm)) {
   2466          case 4: op32 = Iop_Shl32; break;
   2467          case 5: op32 = Iop_Shr32; break;
   2468          case 6: op32 = Iop_Shl32; break;
   2469          case 7: op32 = Iop_Sar32; break;
   2470          /*NOTREACHED*/
   2471          default: vpanic("dis_Grp2:shift"); break;
   2472       }
   2473 
   2474       /* Widen the value to be shifted to 32 bits, do the shift, and
   2475          narrow back down.  This seems surprisingly long-winded, but
   2476          unfortunately the Intel semantics requires that 8/16-bit
   2477          shifts give defined results for shift values all the way up
   2478          to 31, and this seems the simplest way to do it.  It has the
   2479          advantage that the only IR level shifts generated are of 32
   2480          bit values, and the shift amount is guaranteed to be in the
   2481          range 0 .. 31, thereby observing the IR semantics requiring
   2482          all shift values to be in the range 0 .. 2^word_size-1. */
   2483 
   2484       /* shift_amt = shift_expr & 31, regardless of operation size */
   2485       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
   2486 
   2487       /* suitably widen the value to be shifted to 32 bits. */
   2488       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
   2489                                      : widenUto32(mkexpr(dst0)) );
   2490 
   2491       /* res32 = pre32 `shift` shift_amt */
   2492       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
   2493 
   2494       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
   2495       assign( res32ss,
   2496               binop(op32,
   2497                     mkexpr(pre32),
   2498                     binop(Iop_And8,
   2499                           binop(Iop_Sub8,
   2500                                 mkexpr(shift_amt), mkU8(1)),
   2501                           mkU8(31))) );
   2502 
   2503       /* Build the flags thunk. */
   2504       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
   2505 
   2506       /* Narrow the result back down. */
   2507       assign( dst1, narrowTo(ty, mkexpr(res32)) );
   2508 
   2509    } /* if (isShift) */
   2510 
   2511    else
   2512    if (isRotate) {
   2513       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   2514       Bool   left      = toBool(gregOfRM(modrm) == 0);
   2515       IRTemp rot_amt   = newTemp(Ity_I8);
   2516       IRTemp rot_amt32 = newTemp(Ity_I8);
   2517       IRTemp oldFlags  = newTemp(Ity_I32);
   2518 
   2519       /* rot_amt = shift_expr & mask */
   2520       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   2521          expressions never shift beyond the word size and thus remain
   2522          well defined. */
   2523       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
   2524 
   2525       if (ty == Ity_I32)
   2526          assign(rot_amt, mkexpr(rot_amt32));
   2527       else
   2528          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
   2529 
   2530       if (left) {
   2531 
   2532          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   2533          assign(dst1,
   2534             binop( mkSizedOp(ty,Iop_Or8),
   2535                    binop( mkSizedOp(ty,Iop_Shl8),
   2536                           mkexpr(dst0),
   2537                           mkexpr(rot_amt)
   2538                    ),
   2539                    binop( mkSizedOp(ty,Iop_Shr8),
   2540                           mkexpr(dst0),
   2541                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2542                    )
   2543             )
   2544          );
   2545          ccOp += X86G_CC_OP_ROLB;
   2546 
   2547       } else { /* right */
   2548 
   2549          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   2550          assign(dst1,
   2551             binop( mkSizedOp(ty,Iop_Or8),
   2552                    binop( mkSizedOp(ty,Iop_Shr8),
   2553                           mkexpr(dst0),
   2554                           mkexpr(rot_amt)
   2555                    ),
   2556                    binop( mkSizedOp(ty,Iop_Shl8),
   2557                           mkexpr(dst0),
   2558                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2559                    )
   2560             )
   2561          );
   2562          ccOp += X86G_CC_OP_RORB;
   2563 
   2564       }
   2565 
   2566       /* dst1 now holds the rotated value.  Build flag thunk.  We
   2567          need the resulting value for this, and the previous flags.
   2568          Except don't set it if the rotate count is zero. */
   2569 
   2570       assign(oldFlags, mk_x86g_calculate_eflags_all());
   2571 
   2572       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   2573       stmt( IRStmt_Put( OFFB_CC_OP,
   2574                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2575                                       IRExpr_Get(OFFB_CC_OP,Ity_I32),
   2576                                       mkU32(ccOp))) );
   2577       stmt( IRStmt_Put( OFFB_CC_DEP1,
   2578                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2579                                       IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
   2580                                       widenUto32(mkexpr(dst1)))) );
   2581       stmt( IRStmt_Put( OFFB_CC_DEP2,
   2582                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2583                                       IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
   2584                                       mkU32(0))) );
   2585       stmt( IRStmt_Put( OFFB_CC_NDEP,
   2586                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2587                                       IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
   2588                                       mkexpr(oldFlags))) );
   2589    } /* if (isRotate) */
   2590 
   2591    /* Save result, and finish up. */
   2592    if (epartIsReg(modrm)) {
   2593       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2594       if (vex_traceflags & VEX_TRACE_FE) {
   2595          vex_printf("%s%c ",
   2596                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2597          if (shift_expr_txt)
   2598             vex_printf("%s", shift_expr_txt);
   2599          else
   2600             ppIRExpr(shift_expr);
   2601          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
   2602       }
   2603    } else {
   2604       storeLE(mkexpr(addr), mkexpr(dst1));
   2605       if (vex_traceflags & VEX_TRACE_FE) {
   2606          vex_printf("%s%c ",
   2607                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2608          if (shift_expr_txt)
   2609             vex_printf("%s", shift_expr_txt);
   2610          else
   2611             ppIRExpr(shift_expr);
   2612          vex_printf(", %s\n", dis_buf);
   2613       }
   2614    }
   2615    return delta;
   2616 }
   2617 
   2618 
   2619 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2620 static
   2621 UInt dis_Grp8_Imm ( UChar sorb,
   2622                     Bool locked,
   2623                     Int delta, UChar modrm,
   2624                     Int am_sz, Int sz, UInt src_val,
   2625                     Bool* decode_OK )
   2626 {
   2627    /* src_val denotes a d8.
   2628       And delta on entry points at the modrm byte. */
   2629 
   2630    IRType ty     = szToITy(sz);
   2631    IRTemp t2     = newTemp(Ity_I32);
   2632    IRTemp t2m    = newTemp(Ity_I32);
   2633    IRTemp t_addr = IRTemp_INVALID;
   2634    HChar  dis_buf[50];
   2635    UInt   mask;
   2636 
   2637    /* we're optimists :-) */
   2638    *decode_OK = True;
   2639 
   2640    /* Limit src_val -- the bit offset -- to something within a word.
   2641       The Intel docs say that literal offsets larger than a word are
   2642       masked in this way. */
   2643    switch (sz) {
   2644       case 2:  src_val &= 15; break;
   2645       case 4:  src_val &= 31; break;
   2646       default: *decode_OK = False; return delta;
   2647    }
   2648 
   2649    /* Invent a mask suitable for the operation. */
   2650    switch (gregOfRM(modrm)) {
   2651       case 4: /* BT */  mask = 0;               break;
   2652       case 5: /* BTS */ mask = 1 << src_val;    break;
   2653       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2654       case 7: /* BTC */ mask = 1 << src_val;    break;
   2655          /* If this needs to be extended, probably simplest to make a
   2656             new function to handle the other cases (0 .. 3).  The
   2657             Intel docs do however not indicate any use for 0 .. 3, so
   2658             we don't expect this to happen. */
   2659       default: *decode_OK = False; return delta;
   2660    }
   2661 
   2662    /* Fetch the value to be tested and modified into t2, which is
   2663       32-bits wide regardless of sz. */
   2664    if (epartIsReg(modrm)) {
   2665       vassert(am_sz == 1);
   2666       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2667       delta += (am_sz + 1);
   2668       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2669                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2670    } else {
   2671       Int len;
   2672       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2673       delta  += (len+1);
   2674       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2675       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2676                               src_val, dis_buf);
   2677    }
   2678 
   2679    /* Compute the new value into t2m, if non-BT. */
   2680    switch (gregOfRM(modrm)) {
   2681       case 4: /* BT */
   2682          break;
   2683       case 5: /* BTS */
   2684          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2685          break;
   2686       case 6: /* BTR */
   2687          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2688          break;
   2689       case 7: /* BTC */
   2690          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2691          break;
   2692       default:
   2693          /*NOTREACHED*/ /*the previous switch guards this*/
   2694          vassert(0);
   2695    }
   2696 
   2697    /* Write the result back, if non-BT.  If the CAS fails then we
   2698       side-exit from the trace at this point, and so the flag state is
   2699       not affected.  This is of course as required. */
   2700    if (gregOfRM(modrm) != 4 /* BT */) {
   2701       if (epartIsReg(modrm)) {
   2702          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2703       } else {
   2704          if (locked) {
   2705             casLE( mkexpr(t_addr),
   2706                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2707                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2708                    guest_EIP_curr_instr );
   2709          } else {
   2710             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2711          }
   2712       }
   2713    }
   2714 
   2715    /* Copy relevant bit from t2 into the carry flag. */
   2716    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2717    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2718    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2719    stmt( IRStmt_Put(
   2720             OFFB_CC_DEP1,
   2721             binop(Iop_And32,
   2722                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2723                   mkU32(1))
   2724        ));
   2725    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2726       elimination of previous stores to this field work better. */
   2727    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2728 
   2729    return delta;
   2730 }
   2731 
   2732 
   2733 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2734    value in EAX/AX/AL by the given IRTemp, and park the result in
   2735    EDX:EAX/DX:AX/AX.
   2736 */
   2737 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2738                                IRTemp tmp, HChar* tmp_txt )
   2739 {
   2740    IRType ty = szToITy(sz);
   2741    IRTemp t1 = newTemp(ty);
   2742 
   2743    assign( t1, getIReg(sz, R_EAX) );
   2744 
   2745    switch (ty) {
   2746       case Ity_I32: {
   2747          IRTemp res64   = newTemp(Ity_I64);
   2748          IRTemp resHi   = newTemp(Ity_I32);
   2749          IRTemp resLo   = newTemp(Ity_I32);
   2750          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2751          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2752          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2753          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2754          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2755          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2756          putIReg(4, R_EDX, mkexpr(resHi));
   2757          putIReg(4, R_EAX, mkexpr(resLo));
   2758          break;
   2759       }
   2760       case Ity_I16: {
   2761          IRTemp res32   = newTemp(Ity_I32);
   2762          IRTemp resHi   = newTemp(Ity_I16);
   2763          IRTemp resLo   = newTemp(Ity_I16);
   2764          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2765          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2766          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2767          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2768          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2769          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2770          putIReg(2, R_EDX, mkexpr(resHi));
   2771          putIReg(2, R_EAX, mkexpr(resLo));
   2772          break;
   2773       }
   2774       case Ity_I8: {
   2775          IRTemp res16   = newTemp(Ity_I16);
   2776          IRTemp resHi   = newTemp(Ity_I8);
   2777          IRTemp resLo   = newTemp(Ity_I8);
   2778          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2779          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2780          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2781          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2782          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2783          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2784          putIReg(2, R_EAX, mkexpr(res16));
   2785          break;
   2786       }
   2787       default:
   2788          vpanic("codegen_mulL_A_D(x86)");
   2789    }
   2790    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2791 }
   2792 
   2793 
   2794 /* Group 3 extended opcodes. */
   2795 static
   2796 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2797 {
   2798    UInt    d32;
   2799    UChar   modrm;
   2800    HChar   dis_buf[50];
   2801    Int     len;
   2802    IRTemp  addr;
   2803    IRType  ty = szToITy(sz);
   2804    IRTemp  t1 = newTemp(ty);
   2805    IRTemp dst1, src, dst0;
   2806 
   2807    *decode_OK = True; /* may change this later */
   2808 
   2809    modrm = getIByte(delta);
   2810 
   2811    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2812       /* LOCK prefix only allowed with not and neg subopcodes */
   2813       *decode_OK = False;
   2814       return delta;
   2815    }
   2816 
   2817    if (epartIsReg(modrm)) {
   2818       switch (gregOfRM(modrm)) {
   2819          case 0: { /* TEST */
   2820             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2821             dst1 = newTemp(ty);
   2822             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2823                                getIReg(sz,eregOfRM(modrm)),
   2824                                mkU(ty,d32)));
   2825             setFlags_DEP1( Iop_And8, dst1, ty );
   2826             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2827                                       nameIReg(sz, eregOfRM(modrm)));
   2828             break;
   2829          }
   2830          case 1: /* UNDEFINED */
   2831            /* The Intel docs imply this insn is undefined and binutils
   2832               agrees.  Unfortunately Core 2 will run it (with who
   2833               knows what result?)  sandpile.org reckons it's an alias
   2834               for case 0.  We play safe. */
   2835            *decode_OK = False;
   2836            break;
   2837          case 2: /* NOT */
   2838             delta++;
   2839             putIReg(sz, eregOfRM(modrm),
   2840                         unop(mkSizedOp(ty,Iop_Not8),
   2841                              getIReg(sz, eregOfRM(modrm))));
   2842             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2843             break;
   2844          case 3: /* NEG */
   2845             delta++;
   2846             dst0 = newTemp(ty);
   2847             src  = newTemp(ty);
   2848             dst1 = newTemp(ty);
   2849             assign(dst0, mkU(ty,0));
   2850             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2851             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2852             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2853             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2854             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2855             break;
   2856          case 4: /* MUL (unsigned widening) */
   2857             delta++;
   2858             src = newTemp(ty);
   2859             assign(src, getIReg(sz,eregOfRM(modrm)));
   2860             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2861             break;
   2862          case 5: /* IMUL (signed widening) */
   2863             delta++;
   2864             src = newTemp(ty);
   2865             assign(src, getIReg(sz,eregOfRM(modrm)));
   2866             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2867             break;
   2868          case 6: /* DIV */
   2869             delta++;
   2870             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2871             codegen_div ( sz, t1, False );
   2872             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2873             break;
   2874          case 7: /* IDIV */
   2875             delta++;
   2876             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2877             codegen_div ( sz, t1, True );
   2878             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2879             break;
   2880          default:
   2881             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2882             vpanic("Grp3(x86)");
   2883       }
   2884    } else {
   2885       addr = disAMode ( &len, sorb, delta, dis_buf );
   2886       t1   = newTemp(ty);
   2887       delta += len;
   2888       assign(t1, loadLE(ty,mkexpr(addr)));
   2889       switch (gregOfRM(modrm)) {
   2890          case 0: { /* TEST */
   2891             d32 = getUDisp(sz, delta); delta += sz;
   2892             dst1 = newTemp(ty);
   2893             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2894                                mkexpr(t1), mkU(ty,d32)));
   2895             setFlags_DEP1( Iop_And8, dst1, ty );
   2896             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2897             break;
   2898          }
   2899          case 1: /* UNDEFINED */
   2900            /* See comment above on R case */
   2901            *decode_OK = False;
   2902            break;
   2903          case 2: /* NOT */
   2904             dst1 = newTemp(ty);
   2905             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2906             if (locked) {
   2907                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2908                                     guest_EIP_curr_instr );
   2909             } else {
   2910                storeLE( mkexpr(addr), mkexpr(dst1) );
   2911             }
   2912             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2913             break;
   2914          case 3: /* NEG */
   2915             dst0 = newTemp(ty);
   2916             src  = newTemp(ty);
   2917             dst1 = newTemp(ty);
   2918             assign(dst0, mkU(ty,0));
   2919             assign(src,  mkexpr(t1));
   2920             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2921                                mkexpr(dst0), mkexpr(src)));
   2922             if (locked) {
   2923                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2924                                     guest_EIP_curr_instr );
   2925             } else {
   2926                storeLE( mkexpr(addr), mkexpr(dst1) );
   2927             }
   2928             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2929             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2930             break;
   2931          case 4: /* MUL */
   2932             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2933             break;
   2934          case 5: /* IMUL */
   2935             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2936             break;
   2937          case 6: /* DIV */
   2938             codegen_div ( sz, t1, False );
   2939             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2940             break;
   2941          case 7: /* IDIV */
   2942             codegen_div ( sz, t1, True );
   2943             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2944             break;
   2945          default:
   2946             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2947             vpanic("Grp3(x86)");
   2948       }
   2949    }
   2950    return delta;
   2951 }
   2952 
   2953 
   2954 /* Group 4 extended opcodes. */
   2955 static
   2956 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2957 {
   2958    Int   alen;
   2959    UChar modrm;
   2960    HChar dis_buf[50];
   2961    IRType ty = Ity_I8;
   2962    IRTemp t1 = newTemp(ty);
   2963    IRTemp t2 = newTemp(ty);
   2964 
   2965    *decode_OK = True;
   2966 
   2967    modrm = getIByte(delta);
   2968 
   2969    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2970       /* LOCK prefix only allowed with inc and dec subopcodes */
   2971       *decode_OK = False;
   2972       return delta;
   2973    }
   2974 
   2975    if (epartIsReg(modrm)) {
   2976       assign(t1, getIReg(1, eregOfRM(modrm)));
   2977       switch (gregOfRM(modrm)) {
   2978          case 0: /* INC */
   2979             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2980             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2981             setFlags_INC_DEC( True, t2, ty );
   2982             break;
   2983          case 1: /* DEC */
   2984             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2985             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2986             setFlags_INC_DEC( False, t2, ty );
   2987             break;
   2988          default:
   2989             *decode_OK = False;
   2990             return delta;
   2991       }
   2992       delta++;
   2993       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   2994                       nameIReg(1, eregOfRM(modrm)));
   2995    } else {
   2996       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   2997       assign( t1, loadLE(ty, mkexpr(addr)) );
   2998       switch (gregOfRM(modrm)) {
   2999          case 0: /* INC */
   3000             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   3001             if (locked) {
   3002                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3003                       guest_EIP_curr_instr );
   3004             } else {
   3005                storeLE( mkexpr(addr), mkexpr(t2) );
   3006             }
   3007             setFlags_INC_DEC( True, t2, ty );
   3008             break;
   3009          case 1: /* DEC */
   3010             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   3011             if (locked) {
   3012                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   3013                       guest_EIP_curr_instr );
   3014             } else {
   3015                storeLE( mkexpr(addr), mkexpr(t2) );
   3016             }
   3017             setFlags_INC_DEC( False, t2, ty );
   3018             break;
   3019          default:
   3020             *decode_OK = False;
   3021             return delta;
   3022       }
   3023       delta += alen;
   3024       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3025    }
   3026    return delta;
   3027 }
   3028 
   3029 
   3030 /* Group 5 extended opcodes. */
   3031 static
   3032 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3033                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   3034 {
   3035    Int     len;
   3036    UChar   modrm;
   3037    HChar   dis_buf[50];
   3038    IRTemp  addr = IRTemp_INVALID;
   3039    IRType  ty = szToITy(sz);
   3040    IRTemp  t1 = newTemp(ty);
   3041    IRTemp  t2 = IRTemp_INVALID;
   3042 
   3043    *decode_OK = True;
   3044 
   3045    modrm = getIByte(delta);
   3046 
   3047    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3048       /* LOCK prefix only allowed with inc and dec subopcodes */
   3049       *decode_OK = False;
   3050       return delta;
   3051    }
   3052 
   3053    if (epartIsReg(modrm)) {
   3054       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3055       switch (gregOfRM(modrm)) {
   3056          case 0: /* INC */
   3057             vassert(sz == 2 || sz == 4);
   3058             t2 = newTemp(ty);
   3059             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3060                              mkexpr(t1), mkU(ty,1)));
   3061             setFlags_INC_DEC( True, t2, ty );
   3062             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3063             break;
   3064          case 1: /* DEC */
   3065             vassert(sz == 2 || sz == 4);
   3066             t2 = newTemp(ty);
   3067             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3068                              mkexpr(t1), mkU(ty,1)));
   3069             setFlags_INC_DEC( False, t2, ty );
   3070             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3071             break;
   3072          case 2: /* call Ev */
   3073             vassert(sz == 4);
   3074             t2 = newTemp(Ity_I32);
   3075             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3076             putIReg(4, R_ESP, mkexpr(t2));
   3077             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3078             jmp_treg(dres, Ijk_Call, t1);
   3079             vassert(dres->whatNext == Dis_StopHere);
   3080             break;
   3081          case 4: /* jmp Ev */
   3082             vassert(sz == 4);
   3083             jmp_treg(dres, Ijk_Boring, t1);
   3084             vassert(dres->whatNext == Dis_StopHere);
   3085             break;
   3086          case 6: /* PUSH Ev */
   3087             vassert(sz == 4 || sz == 2);
   3088             t2 = newTemp(Ity_I32);
   3089             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3090             putIReg(4, R_ESP, mkexpr(t2) );
   3091             storeLE( mkexpr(t2), mkexpr(t1) );
   3092             break;
   3093          default:
   3094             *decode_OK = False;
   3095             return delta;
   3096       }
   3097       delta++;
   3098       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3099                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3100    } else {
   3101       addr = disAMode ( &len, sorb, delta, dis_buf );
   3102       assign(t1, loadLE(ty,mkexpr(addr)));
   3103       switch (gregOfRM(modrm)) {
   3104          case 0: /* INC */
   3105             t2 = newTemp(ty);
   3106             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3107                              mkexpr(t1), mkU(ty,1)));
   3108             if (locked) {
   3109                casLE( mkexpr(addr),
   3110                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3111             } else {
   3112                storeLE(mkexpr(addr),mkexpr(t2));
   3113             }
   3114             setFlags_INC_DEC( True, t2, ty );
   3115             break;
   3116          case 1: /* DEC */
   3117             t2 = newTemp(ty);
   3118             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3119                              mkexpr(t1), mkU(ty,1)));
   3120             if (locked) {
   3121                casLE( mkexpr(addr),
   3122                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3123             } else {
   3124                storeLE(mkexpr(addr),mkexpr(t2));
   3125             }
   3126             setFlags_INC_DEC( False, t2, ty );
   3127             break;
   3128          case 2: /* call Ev */
   3129             vassert(sz == 4);
   3130             t2 = newTemp(Ity_I32);
   3131             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3132             putIReg(4, R_ESP, mkexpr(t2));
   3133             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3134             jmp_treg(dres, Ijk_Call, t1);
   3135             vassert(dres->whatNext == Dis_StopHere);
   3136             break;
   3137          case 4: /* JMP Ev */
   3138             vassert(sz == 4);
   3139             jmp_treg(dres, Ijk_Boring, t1);
   3140             vassert(dres->whatNext == Dis_StopHere);
   3141             break;
   3142          case 6: /* PUSH Ev */
   3143             vassert(sz == 4 || sz == 2);
   3144             t2 = newTemp(Ity_I32);
   3145             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3146             putIReg(4, R_ESP, mkexpr(t2) );
   3147             storeLE( mkexpr(t2), mkexpr(t1) );
   3148             break;
   3149          default:
   3150             *decode_OK = False;
   3151             return delta;
   3152       }
   3153       delta += len;
   3154       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3155                        nameISize(sz), dis_buf);
   3156    }
   3157    return delta;
   3158 }
   3159 
   3160 
   3161 /*------------------------------------------------------------*/
   3162 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3163 /*------------------------------------------------------------*/
   3164 
   3165 /* Code shared by all the string ops */
   3166 static
   3167 void dis_string_op_increment(Int sz, Int t_inc)
   3168 {
   3169    if (sz == 4 || sz == 2) {
   3170       assign( t_inc,
   3171               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3172                                mkU8(sz/2) ) );
   3173    } else {
   3174       assign( t_inc,
   3175               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3176    }
   3177 }
   3178 
   3179 static
   3180 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3181                     Int sz, HChar* name, UChar sorb )
   3182 {
   3183    IRTemp t_inc = newTemp(Ity_I32);
   3184    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3185    dis_string_op_increment(sz, t_inc);
   3186    dis_OP( sz, t_inc );
   3187    DIP("%s%c\n", name, nameISize(sz));
   3188 }
   3189 
   3190 static
   3191 void dis_MOVS ( Int sz, IRTemp t_inc )
   3192 {
   3193    IRType ty = szToITy(sz);
   3194    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3195    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3196 
   3197    assign( td, getIReg(4, R_EDI) );
   3198    assign( ts, getIReg(4, R_ESI) );
   3199 
   3200    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3201 
   3202    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3203    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3204 }
   3205 
   3206 static
   3207 void dis_LODS ( Int sz, IRTemp t_inc )
   3208 {
   3209    IRType ty = szToITy(sz);
   3210    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3211 
   3212    assign( ts, getIReg(4, R_ESI) );
   3213 
   3214    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3215 
   3216    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3217 }
   3218 
   3219 static
   3220 void dis_STOS ( Int sz, IRTemp t_inc )
   3221 {
   3222    IRType ty = szToITy(sz);
   3223    IRTemp ta = newTemp(ty);        /* EAX */
   3224    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3225 
   3226    assign( ta, getIReg(sz, R_EAX) );
   3227    assign( td, getIReg(4, R_EDI) );
   3228 
   3229    storeLE( mkexpr(td), mkexpr(ta) );
   3230 
   3231    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3232 }
   3233 
   3234 static
   3235 void dis_CMPS ( Int sz, IRTemp t_inc )
   3236 {
   3237    IRType ty  = szToITy(sz);
   3238    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3239    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3240    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3241    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3242 
   3243    assign( td, getIReg(4, R_EDI) );
   3244    assign( ts, getIReg(4, R_ESI) );
   3245 
   3246    assign( tdv, loadLE(ty,mkexpr(td)) );
   3247    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3248 
   3249    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3250 
   3251    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3252    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3253 }
   3254 
   3255 static
   3256 void dis_SCAS ( Int sz, IRTemp t_inc )
   3257 {
   3258    IRType ty  = szToITy(sz);
   3259    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3260    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3261    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3262 
   3263    assign( ta, getIReg(sz, R_EAX) );
   3264    assign( td, getIReg(4, R_EDI) );
   3265 
   3266    assign( tdv, loadLE(ty,mkexpr(td)) );
   3267    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3268 
   3269    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3270 }
   3271 
   3272 
   3273 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3274    We assume the insn is the last one in the basic block, and so emit a jump
   3275    to the next insn, rather than just falling through. */
   3276 static
   3277 void dis_REP_op ( /*MOD*/DisResult* dres,
   3278                   X86Condcode cond,
   3279                   void (*dis_OP)(Int, IRTemp),
   3280                   Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
   3281 {
   3282    IRTemp t_inc = newTemp(Ity_I32);
   3283    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3284 
   3285    assign( tc, getIReg(4,R_ECX) );
   3286 
   3287    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3288                       Ijk_Boring,
   3289                       IRConst_U32(eip_next), OFFB_EIP ) );
   3290 
   3291    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3292 
   3293    dis_string_op_increment(sz, t_inc);
   3294    dis_OP (sz, t_inc);
   3295 
   3296    if (cond == X86CondAlways) {
   3297       jmp_lit(dres, Ijk_Boring, eip);
   3298       vassert(dres->whatNext == Dis_StopHere);
   3299    } else {
   3300       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3301                          Ijk_Boring,
   3302                          IRConst_U32(eip), OFFB_EIP ) );
   3303       jmp_lit(dres, Ijk_Boring, eip_next);
   3304       vassert(dres->whatNext == Dis_StopHere);
   3305    }
   3306    DIP("%s%c\n", name, nameISize(sz));
   3307 }
   3308 
   3309 
   3310 /*------------------------------------------------------------*/
   3311 /*--- Arithmetic, etc.                                     ---*/
   3312 /*------------------------------------------------------------*/
   3313 
   3314 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3315 static
   3316 UInt dis_mul_E_G ( UChar       sorb,
   3317                    Int         size,
   3318                    Int         delta0 )
   3319 {
   3320    Int    alen;
   3321    HChar  dis_buf[50];
   3322    UChar  rm = getIByte(delta0);
   3323    IRType ty = szToITy(size);
   3324    IRTemp te = newTemp(ty);
   3325    IRTemp tg = newTemp(ty);
   3326    IRTemp resLo = newTemp(ty);
   3327 
   3328    assign( tg, getIReg(size, gregOfRM(rm)) );
   3329    if (epartIsReg(rm)) {
   3330       assign( te, getIReg(size, eregOfRM(rm)) );
   3331    } else {
   3332       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3333       assign( te, loadLE(ty,mkexpr(addr)) );
   3334    }
   3335 
   3336    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3337 
   3338    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3339 
   3340    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3341 
   3342    if (epartIsReg(rm)) {
   3343       DIP("imul%c %s, %s\n", nameISize(size),
   3344                              nameIReg(size,eregOfRM(rm)),
   3345                              nameIReg(size,gregOfRM(rm)));
   3346       return 1+delta0;
   3347    } else {
   3348       DIP("imul%c %s, %s\n", nameISize(size),
   3349                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3350       return alen+delta0;
   3351    }
   3352 }
   3353 
   3354 
   3355 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3356 static
   3357 UInt dis_imul_I_E_G ( UChar       sorb,
   3358                       Int         size,
   3359                       Int         delta,
   3360                       Int         litsize )
   3361 {
   3362    Int    d32, alen;
   3363    HChar  dis_buf[50];
   3364    UChar  rm = getIByte(delta);
   3365    IRType ty = szToITy(size);
   3366    IRTemp te = newTemp(ty);
   3367    IRTemp tl = newTemp(ty);
   3368    IRTemp resLo = newTemp(ty);
   3369 
   3370    vassert(size == 1 || size == 2 || size == 4);
   3371 
   3372    if (epartIsReg(rm)) {
   3373       assign(te, getIReg(size, eregOfRM(rm)));
   3374       delta++;
   3375    } else {
   3376       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3377       assign(te, loadLE(ty, mkexpr(addr)));
   3378       delta += alen;
   3379    }
   3380    d32 = getSDisp(litsize,delta);
   3381    delta += litsize;
   3382 
   3383    if (size == 1) d32 &= 0xFF;
   3384    if (size == 2) d32 &= 0xFFFF;
   3385 
   3386    assign(tl, mkU(ty,d32));
   3387 
   3388    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3389 
   3390    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3391 
   3392    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3393 
   3394    DIP("imul %d, %s, %s\n", d32,
   3395        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3396        nameIReg(size,gregOfRM(rm)) );
   3397    return delta;
   3398 }
   3399 
   3400 
   3401 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3402    the supplied IRTemp, and return a new IRTemp holding the result.
   3403    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3404    argument is zero, return the number of bits in the word (the
   3405    natural semantics). */
   3406 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3407 {
   3408    vassert(ty == Ity_I32 || ty == Ity_I16);
   3409 
   3410    IRTemp src32 = newTemp(Ity_I32);
   3411    assign(src32, widenUto32( mkexpr(src) ));
   3412 
   3413    IRTemp src32x = newTemp(Ity_I32);
   3414    assign(src32x,
   3415           binop(Iop_Shl32, mkexpr(src32),
   3416                            mkU8(32 - 8 * sizeofIRType(ty))));
   3417 
   3418    // Clz32 has undefined semantics when its input is zero, so
   3419    // special-case around that.
   3420    IRTemp res32 = newTemp(Ity_I32);
   3421    assign(res32,
   3422           IRExpr_Mux0X(
   3423              unop(Iop_1Uto8,
   3424                   binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
   3425              unop(Iop_Clz32, mkexpr(src32x)),
   3426              mkU32(8 * sizeofIRType(ty))
   3427    ));
   3428 
   3429    IRTemp res = newTemp(ty);
   3430    assign(res, narrowTo(ty, mkexpr(res32)));
   3431    return res;
   3432 }
   3433 
   3434 
   3435 /*------------------------------------------------------------*/
   3436 /*---                                                      ---*/
   3437 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3438 /*---                                                      ---*/
   3439 /*------------------------------------------------------------*/
   3440 
   3441 /* --- Helper functions for dealing with the register stack. --- */
   3442 
   3443 /* --- Set the emulation-warning pseudo-register. --- */
   3444 
   3445 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3446 {
   3447    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3448    stmt( IRStmt_Put( OFFB_EMWARN, e ) );
   3449 }
   3450 
   3451 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3452 
   3453 static IRExpr* mkQNaN64 ( void )
   3454 {
   3455   /* QNaN is 0 2047 1 0(51times)
   3456      == 0b 11111111111b 1 0(51times)
   3457      == 0x7FF8 0000 0000 0000
   3458    */
   3459    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3460 }
   3461 
   3462 /* --------- Get/put the top-of-stack pointer. --------- */
   3463 
   3464 static IRExpr* get_ftop ( void )
   3465 {
   3466    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3467 }
   3468 
   3469 static void put_ftop ( IRExpr* e )
   3470 {
   3471    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3472    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3473 }
   3474 
   3475 /* --------- Get/put the C3210 bits. --------- */
   3476 
   3477 static IRExpr* get_C3210 ( void )
   3478 {
   3479    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3480 }
   3481 
   3482 static void put_C3210 ( IRExpr* e )
   3483 {
   3484    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3485 }
   3486 
   3487 /* --------- Get/put the FPU rounding mode. --------- */
   3488 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3489 {
   3490    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3491 }
   3492 
   3493 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3494 {
   3495    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3496 }
   3497 
   3498 
   3499 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3500 /* Produces a value in 0 .. 3, which is encoded as per the type
   3501    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3502    per IRRoundingMode, we merely need to get it and mask it for
   3503    safety.
   3504 */
   3505 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3506 {
   3507    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3508 }
   3509 
   3510 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3511 {
   3512    return mkU32(Irrm_NEAREST);
   3513 }
   3514 
   3515 
   3516 /* --------- Get/set FP register tag bytes. --------- */
   3517 
   3518 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3519 
   3520 static void put_ST_TAG ( Int i, IRExpr* value )
   3521 {
   3522    IRRegArray* descr;
   3523    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3524    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3525    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3526 }
   3527 
   3528 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3529    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3530 
   3531 static IRExpr* get_ST_TAG ( Int i )
   3532 {
   3533    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3534    return IRExpr_GetI( descr, get_ftop(), i );
   3535 }
   3536 
   3537 
   3538 /* --------- Get/set FP registers. --------- */
   3539 
   3540 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3541    register's tag to indicate the register is full.  The previous
   3542    state of the register is not checked. */
   3543 
   3544 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3545 {
   3546    IRRegArray* descr;
   3547    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3548    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3549    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   3550    /* Mark the register as in-use. */
   3551    put_ST_TAG(i, mkU8(1));
   3552 }
   3553 
   3554 /* Given i, and some expression e, emit
   3555       ST(i) = is_full(i) ? NaN : e
   3556    and set the tag accordingly.
   3557 */
   3558 
   3559 static void put_ST ( Int i, IRExpr* value )
   3560 {
   3561    put_ST_UNCHECKED( i,
   3562                      IRExpr_Mux0X( get_ST_TAG(i),
   3563                                    /* 0 means empty */
   3564                                    value,
   3565                                    /* non-0 means full */
   3566                                    mkQNaN64()
   3567                    )
   3568    );
   3569 }
   3570 
   3571 
   3572 /* Given i, generate an expression yielding 'ST(i)'. */
   3573 
   3574 static IRExpr* get_ST_UNCHECKED ( Int i )
   3575 {
   3576    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3577    return IRExpr_GetI( descr, get_ftop(), i );
   3578 }
   3579 
   3580 
   3581 /* Given i, generate an expression yielding
   3582   is_full(i) ? ST(i) : NaN
   3583 */
   3584 
   3585 static IRExpr* get_ST ( Int i )
   3586 {
   3587    return
   3588       IRExpr_Mux0X( get_ST_TAG(i),
   3589                     /* 0 means empty */
   3590                     mkQNaN64(),
   3591                     /* non-0 means full */
   3592                     get_ST_UNCHECKED(i));
   3593 }
   3594 
   3595 
   3596 /* Adjust FTOP downwards by one register. */
   3597 
   3598 static void fp_push ( void )
   3599 {
   3600    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3601 }
   3602 
   3603 /* Adjust FTOP upwards by one register, and mark the vacated register
   3604    as empty.  */
   3605 
   3606 static void fp_pop ( void )
   3607 {
   3608    put_ST_TAG(0, mkU8(0));
   3609    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3610 }
   3611 
   3612 /* Clear the C2 bit of the FPU status register, for
   3613    sin/cos/tan/sincos. */
   3614 
   3615 static void clear_C2 ( void )
   3616 {
   3617    put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
   3618 }
   3619 
   3620 /* Invent a plausible-looking FPU status word value:
   3621       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3622  */
   3623 static IRExpr* get_FPU_sw ( void )
   3624 {
   3625    return
   3626       unop(Iop_32to16,
   3627            binop(Iop_Or32,
   3628                  binop(Iop_Shl32,
   3629                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3630                              mkU8(11)),
   3631                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3632       ));
   3633 }
   3634 
   3635 
   3636 /* ------------------------------------------------------- */
   3637 /* Given all that stack-mangling junk, we can now go ahead
   3638    and describe FP instructions.
   3639 */
   3640 
   3641 /* ST(0) = ST(0) `op` mem64/32(addr)
   3642    Need to check ST(0)'s tag on read, but not on write.
   3643 */
   3644 static
   3645 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3646                          IROp op, Bool dbl )
   3647 {
   3648    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3649    if (dbl) {
   3650       put_ST_UNCHECKED(0,
   3651          triop( op,
   3652                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3653                 get_ST(0),
   3654                 loadLE(Ity_F64,mkexpr(addr))
   3655          ));
   3656    } else {
   3657       put_ST_UNCHECKED(0,
   3658          triop( op,
   3659                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3660                 get_ST(0),
   3661                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3662          ));
   3663    }
   3664 }
   3665 
   3666 
   3667 /* ST(0) = mem64/32(addr) `op` ST(0)
   3668    Need to check ST(0)'s tag on read, but not on write.
   3669 */
   3670 static
   3671 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3672                             IROp op, Bool dbl )
   3673 {
   3674    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3675    if (dbl) {
   3676       put_ST_UNCHECKED(0,
   3677          triop( op,
   3678                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3679                 loadLE(Ity_F64,mkexpr(addr)),
   3680                 get_ST(0)
   3681          ));
   3682    } else {
   3683       put_ST_UNCHECKED(0,
   3684          triop( op,
   3685                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3686                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3687                 get_ST(0)
   3688          ));
   3689    }
   3690 }
   3691 
   3692 
   3693 /* ST(dst) = ST(dst) `op` ST(src).
   3694    Check dst and src tags when reading but not on write.
   3695 */
   3696 static
   3697 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3698                       Bool pop_after )
   3699 {
   3700    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3701                                  (Int)st_src, (Int)st_dst );
   3702    put_ST_UNCHECKED(
   3703       st_dst,
   3704       triop( op,
   3705              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3706              get_ST(st_dst),
   3707              get_ST(st_src) )
   3708    );
   3709    if (pop_after)
   3710       fp_pop();
   3711 }
   3712 
   3713 /* ST(dst) = ST(src) `op` ST(dst).
   3714    Check dst and src tags when reading but not on write.
   3715 */
   3716 static
   3717 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3718                          Bool pop_after )
   3719 {
   3720    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3721                                  (Int)st_src, (Int)st_dst );
   3722    put_ST_UNCHECKED(
   3723       st_dst,
   3724       triop( op,
   3725              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3726              get_ST(st_src),
   3727              get_ST(st_dst) )
   3728    );
   3729    if (pop_after)
   3730       fp_pop();
   3731 }
   3732 
   3733 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3734 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3735 {
   3736    DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
   3737    /* This is a bit of a hack (and isn't really right).  It sets
   3738       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3739       documentation implies A and S are unchanged.
   3740    */
   3741    /* It's also fishy in that it is used both for COMIP and
   3742       UCOMIP, and they aren't the same (although similar). */
   3743    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3744    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3745    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3746                      binop( Iop_And32,
   3747                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3748                             mkU32(0x45)
   3749        )));
   3750    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3751       elimination of previous stores to this field work better. */
   3752    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3753    if (pop_after)
   3754       fp_pop();
   3755 }
   3756 
   3757 
   3758 static
   3759 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3760 {
   3761    Int    len;
   3762    UInt   r_src, r_dst;
   3763    HChar  dis_buf[50];
   3764    IRTemp t1, t2;
   3765 
   3766    /* On entry, delta points at the second byte of the insn (the modrm
   3767       byte).*/
   3768    UChar first_opcode = getIByte(delta-1);
   3769    UChar modrm        = getIByte(delta+0);
   3770 
   3771    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3772 
   3773    if (first_opcode == 0xD8) {
   3774       if (modrm < 0xC0) {
   3775 
   3776          /* bits 5,4,3 are an opcode extension, and the modRM also
   3777            specifies an address. */
   3778          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3779          delta += len;
   3780 
   3781          switch (gregOfRM(modrm)) {
   3782 
   3783             case 0: /* FADD single-real */
   3784                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3785                break;
   3786 
   3787             case 1: /* FMUL single-real */
   3788                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3789                break;
   3790 
   3791             case 2: /* FCOM single-real */
   3792                DIP("fcoms %s\n", dis_buf);
   3793                /* This forces C1 to zero, which isn't right. */
   3794                put_C3210(
   3795                    binop( Iop_And32,
   3796                           binop(Iop_Shl32,
   3797                                 binop(Iop_CmpF64,
   3798                                       get_ST(0),
   3799                                       unop(Iop_F32toF64,
   3800                                            loadLE(Ity_F32,mkexpr(addr)))),
   3801                                 mkU8(8)),
   3802                           mkU32(0x4500)
   3803                    ));
   3804                break;
   3805 
   3806             case 3: /* FCOMP single-real */
   3807                DIP("fcomps %s\n", dis_buf);
   3808                /* This forces C1 to zero, which isn't right. */
   3809                put_C3210(
   3810                    binop( Iop_And32,
   3811                           binop(Iop_Shl32,
   3812                                 binop(Iop_CmpF64,
   3813                                       get_ST(0),
   3814                                       unop(Iop_F32toF64,
   3815                                            loadLE(Ity_F32,mkexpr(addr)))),
   3816                                 mkU8(8)),
   3817                           mkU32(0x4500)
   3818                    ));
   3819                fp_pop();
   3820                break;
   3821 
   3822             case 4: /* FSUB single-real */
   3823                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3824                break;
   3825 
   3826             case 5: /* FSUBR single-real */
   3827                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3828                break;
   3829 
   3830             case 6: /* FDIV single-real */
   3831                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3832                break;
   3833 
   3834             case 7: /* FDIVR single-real */
   3835                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3836                break;
   3837 
   3838             default:
   3839                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   3840                vex_printf("first_opcode == 0xD8\n");
   3841                goto decode_fail;
   3842          }
   3843       } else {
   3844          delta++;
   3845          switch (modrm) {
   3846 
   3847             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3848                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3849                break;
   3850 
   3851             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3852                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3853                break;
   3854 
   3855             /* Dunno if this is right */
   3856             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3857                r_dst = (UInt)modrm - 0xD0;
   3858                DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
   3859                /* This forces C1 to zero, which isn't right. */
   3860                put_C3210(
   3861                    binop( Iop_And32,
   3862                           binop(Iop_Shl32,
   3863                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3864                                 mkU8(8)),
   3865                           mkU32(0x4500)
   3866                    ));
   3867                break;
   3868 
   3869             /* Dunno if this is right */
   3870             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3871                r_dst = (UInt)modrm - 0xD8;
   3872                DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
   3873                /* This forces C1 to zero, which isn't right. */
   3874                put_C3210(
   3875                    binop( Iop_And32,
   3876                           binop(Iop_Shl32,
   3877                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3878                                 mkU8(8)),
   3879                           mkU32(0x4500)
   3880                    ));
   3881                fp_pop();
   3882                break;
   3883 
   3884             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3885                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3886                break;
   3887 
   3888             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3889                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3890                break;
   3891 
   3892             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3893                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3894                break;
   3895 
   3896             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3897                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3898                break;
   3899 
   3900             default:
   3901                goto decode_fail;
   3902          }
   3903       }
   3904    }
   3905 
   3906    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3907    else
   3908    if (first_opcode == 0xD9) {
   3909       if (modrm < 0xC0) {
   3910 
   3911          /* bits 5,4,3 are an opcode extension, and the modRM also
   3912             specifies an address. */
   3913          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3914          delta += len;
   3915 
   3916          switch (gregOfRM(modrm)) {
   3917 
   3918             case 0: /* FLD single-real */
   3919                DIP("flds %s\n", dis_buf);
   3920                fp_push();
   3921                put_ST(0, unop(Iop_F32toF64,
   3922                               loadLE(Ity_F32, mkexpr(addr))));
   3923                break;
   3924 
   3925             case 2: /* FST single-real */
   3926                DIP("fsts %s\n", dis_buf);
   3927                storeLE(mkexpr(addr),
   3928                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3929                break;
   3930 
   3931             case 3: /* FSTP single-real */
   3932                DIP("fstps %s\n", dis_buf);
   3933                storeLE(mkexpr(addr),
   3934                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3935                fp_pop();
   3936                break;
   3937 
   3938             case 4: { /* FLDENV m28 */
   3939                /* Uses dirty helper:
   3940                      VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
   3941                IRTemp   ew = newTemp(Ity_I32);
   3942                IRDirty* d  = unsafeIRDirty_0_N (
   3943                                 0/*regparms*/,
   3944                                 "x86g_dirtyhelper_FLDENV",
   3945                                 &x86g_dirtyhelper_FLDENV,
   3946                                 mkIRExprVec_1( mkexpr(addr) )
   3947                              );
   3948                d->needsBBP = True;
   3949                d->tmp      = ew;
   3950                /* declare we're reading memory */
   3951                d->mFx   = Ifx_Read;
   3952                d->mAddr = mkexpr(addr);
   3953                d->mSize = 28;
   3954 
   3955                /* declare we're writing guest state */
   3956                d->nFxState = 4;
   3957                vex_bzero(&d->fxState, sizeof(d->fxState));
   3958 
   3959                d->fxState[0].fx     = Ifx_Write;
   3960                d->fxState[0].offset = OFFB_FTOP;
   3961                d->fxState[0].size   = sizeof(UInt);
   3962 
   3963                d->fxState[1].fx     = Ifx_Write;
   3964                d->fxState[1].offset = OFFB_FPTAGS;
   3965                d->fxState[1].size   = 8 * sizeof(UChar);
   3966 
   3967                d->fxState[2].fx     = Ifx_Write;
   3968                d->fxState[2].offset = OFFB_FPROUND;
   3969                d->fxState[2].size   = sizeof(UInt);
   3970 
   3971                d->fxState[3].fx     = Ifx_Write;
   3972                d->fxState[3].offset = OFFB_FC3210;
   3973                d->fxState[3].size   = sizeof(UInt);
   3974 
   3975                stmt( IRStmt_Dirty(d) );
   3976 
   3977                /* ew contains any emulation warning we may need to
   3978                   issue.  If needed, side-exit to the next insn,
   3979                   reporting the warning, so that Valgrind's dispatcher
   3980                   sees the warning. */
   3981                put_emwarn( mkexpr(ew) );
   3982                stmt(
   3983                   IRStmt_Exit(
   3984                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   3985                      Ijk_EmWarn,
   3986                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   3987                      OFFB_EIP
   3988                   )
   3989                );
   3990 
   3991                DIP("fldenv %s\n", dis_buf);
   3992                break;
   3993             }
   3994 
   3995             case 5: {/* FLDCW */
   3996                /* The only thing we observe in the control word is the
   3997                   rounding mode.  Therefore, pass the 16-bit value
   3998                   (x87 native-format control word) to a clean helper,
   3999                   getting back a 64-bit value, the lower half of which
   4000                   is the FPROUND value to store, and the upper half of
   4001                   which is the emulation-warning token which may be
   4002                   generated.
   4003                */
   4004                /* ULong x86g_check_fldcw ( UInt ); */
   4005                IRTemp t64 = newTemp(Ity_I64);
   4006                IRTemp ew = newTemp(Ity_I32);
   4007                DIP("fldcw %s\n", dis_buf);
   4008                assign( t64, mkIRExprCCall(
   4009                                Ity_I64, 0/*regparms*/,
   4010                                "x86g_check_fldcw",
   4011                                &x86g_check_fldcw,
   4012                                mkIRExprVec_1(
   4013                                   unop( Iop_16Uto32,
   4014                                         loadLE(Ity_I16, mkexpr(addr)))
   4015                                )
   4016                             )
   4017                      );
   4018 
   4019                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   4020                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   4021                put_emwarn( mkexpr(ew) );
   4022                /* Finally, if an emulation warning was reported,
   4023                   side-exit to the next insn, reporting the warning,
   4024                   so that Valgrind's dispatcher sees the warning. */
   4025                stmt(
   4026                   IRStmt_Exit(
   4027                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4028                      Ijk_EmWarn,
   4029                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4030                      OFFB_EIP
   4031                   )
   4032                );
   4033                break;
   4034             }
   4035 
   4036             case 6: { /* FNSTENV m28 */
   4037                /* Uses dirty helper:
   4038                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4039                IRDirty* d = unsafeIRDirty_0_N (
   4040                                0/*regparms*/,
   4041                                "x86g_dirtyhelper_FSTENV",
   4042                                &x86g_dirtyhelper_FSTENV,
   4043                                mkIRExprVec_1( mkexpr(addr) )
   4044                             );
   4045                d->needsBBP = True;
   4046                /* declare we're writing memory */
   4047                d->mFx   = Ifx_Write;
   4048                d->mAddr = mkexpr(addr);
   4049                d->mSize = 28;
   4050 
   4051                /* declare we're reading guest state */
   4052                d->nFxState = 4;
   4053                vex_bzero(&d->fxState, sizeof(d->fxState));
   4054 
   4055                d->fxState[0].fx     = Ifx_Read;
   4056                d->fxState[0].offset = OFFB_FTOP;
   4057                d->fxState[0].size   = sizeof(UInt);
   4058 
   4059                d->fxState[1].fx     = Ifx_Read;
   4060                d->fxState[1].offset = OFFB_FPTAGS;
   4061                d->fxState[1].size   = 8 * sizeof(UChar);
   4062 
   4063                d->fxState[2].fx     = Ifx_Read;
   4064                d->fxState[2].offset = OFFB_FPROUND;
   4065                d->fxState[2].size   = sizeof(UInt);
   4066 
   4067                d->fxState[3].fx     = Ifx_Read;
   4068                d->fxState[3].offset = OFFB_FC3210;
   4069                d->fxState[3].size   = sizeof(UInt);
   4070 
   4071                stmt( IRStmt_Dirty(d) );
   4072 
   4073                DIP("fnstenv %s\n", dis_buf);
   4074                break;
   4075             }
   4076 
   4077             case 7: /* FNSTCW */
   4078               /* Fake up a native x87 FPU control word.  The only
   4079                  thing it depends on is FPROUND[1:0], so call a clean
   4080                  helper to cook it up. */
   4081                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4082                DIP("fnstcw %s\n", dis_buf);
   4083                storeLE(
   4084                   mkexpr(addr),
   4085                   unop( Iop_32to16,
   4086                         mkIRExprCCall(
   4087                            Ity_I32, 0/*regp*/,
   4088                            "x86g_create_fpucw", &x86g_create_fpucw,
   4089                            mkIRExprVec_1( get_fpround() )
   4090                         )
   4091                   )
   4092                );
   4093                break;
   4094 
   4095             default:
   4096                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4097                vex_printf("first_opcode == 0xD9\n");
   4098                goto decode_fail;
   4099          }
   4100 
   4101       } else {
   4102          delta++;
   4103          switch (modrm) {
   4104 
   4105             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4106                r_src = (UInt)modrm - 0xC0;
   4107                DIP("fld %%st(%d)\n", (Int)r_src);
   4108                t1 = newTemp(Ity_F64);
   4109                assign(t1, get_ST(r_src));
   4110                fp_push();
   4111                put_ST(0, mkexpr(t1));
   4112                break;
   4113 
   4114             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4115                r_src = (UInt)modrm - 0xC8;
   4116                DIP("fxch %%st(%d)\n", (Int)r_src);
   4117                t1 = newTemp(Ity_F64);
   4118                t2 = newTemp(Ity_F64);
   4119                assign(t1, get_ST(0));
   4120                assign(t2, get_ST(r_src));
   4121                put_ST_UNCHECKED(0, mkexpr(t2));
   4122                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4123                break;
   4124 
   4125             case 0xE0: /* FCHS */
   4126                DIP("fchs\n");
   4127                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4128                break;
   4129 
   4130             case 0xE1: /* FABS */
   4131                DIP("fabs\n");
   4132                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4133                break;
   4134 
   4135             case 0xE4: /* FTST */
   4136                DIP("ftst\n");
   4137                /* This forces C1 to zero, which isn't right. */
   4138                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4139                   set to 0 if stack underflow occurred; otherwise, set
   4140                   to 0" which is pretty nonsensical.  I guess it's a
   4141                    typo. */
   4142                put_C3210(
   4143                    binop( Iop_And32,
   4144                           binop(Iop_Shl32,
   4145                                 binop(Iop_CmpF64,
   4146                                       get_ST(0),
   4147                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4148                                 mkU8(8)),
   4149                           mkU32(0x4500)
   4150                    ));
   4151                break;
   4152 
   4153             case 0xE5: { /* FXAM */
   4154                /* This is an interesting one.  It examines %st(0),
   4155                   regardless of whether the tag says it's empty or not.
   4156                   Here, just pass both the tag (in our format) and the
   4157                   value (as a double, actually a ULong) to a helper
   4158                   function. */
   4159                IRExpr** args
   4160                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4161                                    unop(Iop_ReinterpF64asI64,
   4162                                         get_ST_UNCHECKED(0)) );
   4163                put_C3210(mkIRExprCCall(
   4164                             Ity_I32,
   4165                             0/*regparm*/,
   4166                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4167                             args
   4168                         ));
   4169                DIP("fxam\n");
   4170                break;
   4171             }
   4172 
   4173             case 0xE8: /* FLD1 */
   4174                DIP("fld1\n");
   4175                fp_push();
   4176                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4177                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4178                break;
   4179 
   4180             case 0xE9: /* FLDL2T */
   4181                DIP("fldl2t\n");
   4182                fp_push();
   4183                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4184                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4185                break;
   4186 
   4187             case 0xEA: /* FLDL2E */
   4188                DIP("fldl2e\n");
   4189                fp_push();
   4190                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4191                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4192                break;
   4193 
   4194             case 0xEB: /* FLDPI */
   4195                DIP("fldpi\n");
   4196                fp_push();
   4197                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4198                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4199                break;
   4200 
   4201             case 0xEC: /* FLDLG2 */
   4202                DIP("fldlg2\n");
   4203                fp_push();
   4204                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4205                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4206                break;
   4207 
   4208             case 0xED: /* FLDLN2 */
   4209                DIP("fldln2\n");
   4210                fp_push();
   4211                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4212                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4213                break;
   4214 
   4215             case 0xEE: /* FLDZ */
   4216                DIP("fldz\n");
   4217                fp_push();
   4218                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4219                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4220                break;
   4221 
   4222             case 0xF0: /* F2XM1 */
   4223                DIP("f2xm1\n");
   4224                put_ST_UNCHECKED(0,
   4225                   binop(Iop_2xm1F64,
   4226                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4227                         get_ST(0)));
   4228                break;
   4229 
   4230             case 0xF1: /* FYL2X */
   4231                DIP("fyl2x\n");
   4232                put_ST_UNCHECKED(1,
   4233                   triop(Iop_Yl2xF64,
   4234                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4235                         get_ST(1),
   4236                         get_ST(0)));
   4237                fp_pop();
   4238                break;
   4239 
   4240             case 0xF2: /* FPTAN */
   4241                DIP("ftan\n");
   4242                put_ST_UNCHECKED(0,
   4243                   binop(Iop_TanF64,
   4244                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4245                         get_ST(0)));
   4246                fp_push();
   4247                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   4248                clear_C2(); /* HACK */
   4249                break;
   4250 
   4251             case 0xF3: /* FPATAN */
   4252                DIP("fpatan\n");
   4253                put_ST_UNCHECKED(1,
   4254                   triop(Iop_AtanF64,
   4255                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4256                         get_ST(1),
   4257                         get_ST(0)));
   4258                fp_pop();
   4259                break;
   4260 
   4261             case 0xF4: { /* FXTRACT */
   4262                IRTemp argF = newTemp(Ity_F64);
   4263                IRTemp sigF = newTemp(Ity_F64);
   4264                IRTemp expF = newTemp(Ity_F64);
   4265                IRTemp argI = newTemp(Ity_I64);
   4266                IRTemp sigI = newTemp(Ity_I64);
   4267                IRTemp expI = newTemp(Ity_I64);
   4268                DIP("fxtract\n");
   4269                assign( argF, get_ST(0) );
   4270                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4271                assign( sigI,
   4272                        mkIRExprCCall(
   4273                           Ity_I64, 0/*regparms*/,
   4274                           "x86amd64g_calculate_FXTRACT",
   4275                           &x86amd64g_calculate_FXTRACT,
   4276                           mkIRExprVec_2( mkexpr(argI),
   4277                                          mkIRExpr_HWord(0)/*sig*/ ))
   4278                );
   4279                assign( expI,
   4280                        mkIRExprCCall(
   4281                           Ity_I64, 0/*regparms*/,
   4282                           "x86amd64g_calculate_FXTRACT",
   4283                           &x86amd64g_calculate_FXTRACT,
   4284                           mkIRExprVec_2( mkexpr(argI),
   4285                                          mkIRExpr_HWord(1)/*exp*/ ))
   4286                );
   4287                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4288                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4289                /* exponent */
   4290                put_ST_UNCHECKED(0, mkexpr(expF) );
   4291                fp_push();
   4292                /* significand */
   4293                put_ST(0, mkexpr(sigF) );
   4294                break;
   4295             }
   4296 
   4297             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4298                IRTemp a1 = newTemp(Ity_F64);
   4299                IRTemp a2 = newTemp(Ity_F64);
   4300                DIP("fprem1\n");
   4301                /* Do FPREM1 twice, once to get the remainder, and once
   4302                   to get the C3210 flag values. */
   4303                assign( a1, get_ST(0) );
   4304                assign( a2, get_ST(1) );
   4305                put_ST_UNCHECKED(0,
   4306                   triop(Iop_PRem1F64,
   4307                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4308                         mkexpr(a1),
   4309                         mkexpr(a2)));
   4310                put_C3210(
   4311                   triop(Iop_PRem1C3210F64,
   4312                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4313                         mkexpr(a1),
   4314                         mkexpr(a2)) );
   4315                break;
   4316             }
   4317 
   4318             case 0xF7: /* FINCSTP */
   4319                DIP("fprem\n");
   4320                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4321                break;
   4322 
   4323             case 0xF8: { /* FPREM -- not IEEE compliant */
   4324                IRTemp a1 = newTemp(Ity_F64);
   4325                IRTemp a2 = newTemp(Ity_F64);
   4326                DIP("fprem\n");
   4327                /* Do FPREM twice, once to get the remainder, and once
   4328                   to get the C3210 flag values. */
   4329                assign( a1, get_ST(0) );
   4330                assign( a2, get_ST(1) );
   4331                put_ST_UNCHECKED(0,
   4332                   triop(Iop_PRemF64,
   4333                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4334                         mkexpr(a1),
   4335                         mkexpr(a2)));
   4336                put_C3210(
   4337                   triop(Iop_PRemC3210F64,
   4338                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4339                         mkexpr(a1),
   4340                         mkexpr(a2)) );
   4341                break;
   4342             }
   4343 
   4344             case 0xF9: /* FYL2XP1 */
   4345                DIP("fyl2xp1\n");
   4346                put_ST_UNCHECKED(1,
   4347                   triop(Iop_Yl2xp1F64,
   4348                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4349                         get_ST(1),
   4350                         get_ST(0)));
   4351                fp_pop();
   4352                break;
   4353 
   4354             case 0xFA: /* FSQRT */
   4355                DIP("fsqrt\n");
   4356                put_ST_UNCHECKED(0,
   4357                   binop(Iop_SqrtF64,
   4358                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4359                         get_ST(0)));
   4360                break;
   4361 
   4362             case 0xFB: { /* FSINCOS */
   4363                IRTemp a1 = newTemp(Ity_F64);
   4364                assign( a1, get_ST(0) );
   4365                DIP("fsincos\n");
   4366                put_ST_UNCHECKED(0,
   4367                   binop(Iop_SinF64,
   4368                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4369                         mkexpr(a1)));
   4370                fp_push();
   4371                put_ST(0,
   4372                   binop(Iop_CosF64,
   4373                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4374                         mkexpr(a1)));
   4375                clear_C2(); /* HACK */
   4376                break;
   4377             }
   4378 
   4379             case 0xFC: /* FRNDINT */
   4380                DIP("frndint\n");
   4381                put_ST_UNCHECKED(0,
   4382                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4383                break;
   4384 
   4385             case 0xFD: /* FSCALE */
   4386                DIP("fscale\n");
   4387                put_ST_UNCHECKED(0,
   4388                   triop(Iop_ScaleF64,
   4389                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4390                         get_ST(0),
   4391                         get_ST(1)));
   4392                break;
   4393 
   4394             case 0xFE: /* FSIN */
   4395                DIP("fsin\n");
   4396                put_ST_UNCHECKED(0,
   4397                   binop(Iop_SinF64,
   4398                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4399                         get_ST(0)));
   4400                clear_C2(); /* HACK */
   4401                break;
   4402 
   4403             case 0xFF: /* FCOS */
   4404                DIP("fcos\n");
   4405                put_ST_UNCHECKED(0,
   4406                   binop(Iop_CosF64,
   4407                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4408                         get_ST(0)));
   4409                clear_C2(); /* HACK */
   4410                break;
   4411 
   4412             default:
   4413                goto decode_fail;
   4414          }
   4415       }
   4416    }
   4417 
   4418    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4419    else
   4420    if (first_opcode == 0xDA) {
   4421 
   4422       if (modrm < 0xC0) {
   4423 
   4424          /* bits 5,4,3 are an opcode extension, and the modRM also
   4425             specifies an address. */
   4426          IROp   fop;
   4427          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4428          delta += len;
   4429          switch (gregOfRM(modrm)) {
   4430 
   4431             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4432                DIP("fiaddl %s\n", dis_buf);
   4433                fop = Iop_AddF64;
   4434                goto do_fop_m32;
   4435 
   4436             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4437                DIP("fimull %s\n", dis_buf);
   4438                fop = Iop_MulF64;
   4439                goto do_fop_m32;
   4440 
   4441             case 2: /* FICOM m32int */
   4442                DIP("ficoml %s\n", dis_buf);
   4443                /* This forces C1 to zero, which isn't right. */
   4444                put_C3210(
   4445                    binop( Iop_And32,
   4446                           binop(Iop_Shl32,
   4447                                 binop(Iop_CmpF64,
   4448                                       get_ST(0),
   4449                                       unop(Iop_I32StoF64,
   4450                                            loadLE(Ity_I32,mkexpr(addr)))),
   4451                                 mkU8(8)),
   4452                           mkU32(0x4500)
   4453                    ));
   4454                break;
   4455 
   4456             case 3: /* FICOMP m32int */
   4457                DIP("ficompl %s\n", dis_buf);
   4458                /* This forces C1 to zero, which isn't right. */
   4459                put_C3210(
   4460                    binop( Iop_And32,
   4461                           binop(Iop_Shl32,
   4462                                 binop(Iop_CmpF64,
   4463                                       get_ST(0),
   4464                                       unop(Iop_I32StoF64,
   4465                                            loadLE(Ity_I32,mkexpr(addr)))),
   4466                                 mkU8(8)),
   4467                           mkU32(0x4500)
   4468                    ));
   4469                fp_pop();
   4470                break;
   4471 
   4472             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4473                DIP("fisubl %s\n", dis_buf);
   4474                fop = Iop_SubF64;
   4475                goto do_fop_m32;
   4476 
   4477             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4478                DIP("fisubrl %s\n", dis_buf);
   4479                fop = Iop_SubF64;
   4480                goto do_foprev_m32;
   4481 
   4482             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4483                DIP("fidivl %s\n", dis_buf);
   4484                fop = Iop_DivF64;
   4485                goto do_fop_m32;
   4486 
   4487             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4488                DIP("fidivrl %s\n", dis_buf);
   4489                fop = Iop_DivF64;
   4490                goto do_foprev_m32;
   4491 
   4492             do_fop_m32:
   4493                put_ST_UNCHECKED(0,
   4494                   triop(fop,
   4495                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4496                         get_ST(0),
   4497                         unop(Iop_I32StoF64,
   4498                              loadLE(Ity_I32, mkexpr(addr)))));
   4499                break;
   4500 
   4501             do_foprev_m32:
   4502                put_ST_UNCHECKED(0,
   4503                   triop(fop,
   4504                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4505                         unop(Iop_I32StoF64,
   4506                              loadLE(Ity_I32, mkexpr(addr))),
   4507                         get_ST(0)));
   4508                break;
   4509 
   4510             default:
   4511                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4512                vex_printf("first_opcode == 0xDA\n");
   4513                goto decode_fail;
   4514          }
   4515 
   4516       } else {
   4517 
   4518          delta++;
   4519          switch (modrm) {
   4520 
   4521             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4522                r_src = (UInt)modrm - 0xC0;
   4523                DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
   4524                put_ST_UNCHECKED(0,
   4525                                 IRExpr_Mux0X(
   4526                                     unop(Iop_1Uto8,
   4527                                          mk_x86g_calculate_condition(X86CondB)),
   4528                                     get_ST(0), get_ST(r_src)) );
   4529                break;
   4530 
   4531             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4532                r_src = (UInt)modrm - 0xC8;
   4533                DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
   4534                put_ST_UNCHECKED(0,
   4535                                 IRExpr_Mux0X(
   4536                                     unop(Iop_1Uto8,
   4537                                          mk_x86g_calculate_condition(X86CondZ)),
   4538                                     get_ST(0), get_ST(r_src)) );
   4539                break;
   4540 
   4541             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4542                r_src = (UInt)modrm - 0xD0;
   4543                DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
   4544                put_ST_UNCHECKED(0,
   4545                                 IRExpr_Mux0X(
   4546                                     unop(Iop_1Uto8,
   4547                                          mk_x86g_calculate_condition(X86CondBE)),
   4548                                     get_ST(0), get_ST(r_src)) );
   4549                break;
   4550 
   4551             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4552                r_src = (UInt)modrm - 0xD8;
   4553                DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
   4554                put_ST_UNCHECKED(0,
   4555                                 IRExpr_Mux0X(
   4556                                     unop(Iop_1Uto8,
   4557                                          mk_x86g_calculate_condition(X86CondP)),
   4558                                     get_ST(0), get_ST(r_src)) );
   4559                break;
   4560 
   4561             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4562                DIP("fucompp %%st(0),%%st(1)\n");
   4563                /* This forces C1 to zero, which isn't right. */
   4564                put_C3210(
   4565                    binop( Iop_And32,
   4566                           binop(Iop_Shl32,
   4567                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4568                                 mkU8(8)),
   4569                           mkU32(0x4500)
   4570                    ));
   4571                fp_pop();
   4572                fp_pop();
   4573                break;
   4574 
   4575             default:
   4576                goto decode_fail;
   4577          }
   4578 
   4579       }
   4580    }
   4581 
   4582    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4583    else
   4584    if (first_opcode == 0xDB) {
   4585       if (modrm < 0xC0) {
   4586 
   4587          /* bits 5,4,3 are an opcode extension, and the modRM also
   4588             specifies an address. */
   4589          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4590          delta += len;
   4591 
   4592          switch (gregOfRM(modrm)) {
   4593 
   4594             case 0: /* FILD m32int */
   4595                DIP("fildl %s\n", dis_buf);
   4596                fp_push();
   4597                put_ST(0, unop(Iop_I32StoF64,
   4598                               loadLE(Ity_I32, mkexpr(addr))));
   4599                break;
   4600 
   4601             case 1: /* FISTTPL m32 (SSE3) */
   4602                DIP("fisttpl %s\n", dis_buf);
   4603                storeLE( mkexpr(addr),
   4604                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4605                fp_pop();
   4606                break;
   4607 
   4608             case 2: /* FIST m32 */
   4609                DIP("fistl %s\n", dis_buf);
   4610                storeLE( mkexpr(addr),
   4611                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4612                break;
   4613 
   4614             case 3: /* FISTP m32 */
   4615                DIP("fistpl %s\n", dis_buf);
   4616                storeLE( mkexpr(addr),
   4617                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4618                fp_pop();
   4619                break;
   4620 
   4621             case 5: { /* FLD extended-real */
   4622                /* Uses dirty helper:
   4623                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4624                   addr holds the address.  First, do a dirty call to
   4625                   get hold of the data. */
   4626                IRTemp   val  = newTemp(Ity_I64);
   4627                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4628 
   4629                IRDirty* d = unsafeIRDirty_1_N (
   4630                                val,
   4631                                0/*regparms*/,
   4632                                "x86g_dirtyhelper_loadF80le",
   4633                                &x86g_dirtyhelper_loadF80le,
   4634                                args
   4635                             );
   4636                /* declare that we're reading memory */
   4637                d->mFx   = Ifx_Read;
   4638                d->mAddr = mkexpr(addr);
   4639                d->mSize = 10;
   4640 
   4641                /* execute the dirty call, dumping the result in val. */
   4642                stmt( IRStmt_Dirty(d) );
   4643                fp_push();
   4644                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4645 
   4646                DIP("fldt %s\n", dis_buf);
   4647                break;
   4648             }
   4649 
   4650             case 7: { /* FSTP extended-real */
   4651                /* Uses dirty helper: void x86g_storeF80le ( UInt, ULong ) */
   4652                IRExpr** args
   4653                   = mkIRExprVec_2( mkexpr(addr),
   4654                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4655 
   4656                IRDirty* d = unsafeIRDirty_0_N (
   4657                                0/*regparms*/,
   4658                                "x86g_dirtyhelper_storeF80le",
   4659                                &x86g_dirtyhelper_storeF80le,
   4660                                args
   4661                             );
   4662                /* declare we're writing memory */
   4663                d->mFx   = Ifx_Write;
   4664                d->mAddr = mkexpr(addr);
   4665                d->mSize = 10;
   4666 
   4667                /* execute the dirty call. */
   4668                stmt( IRStmt_Dirty(d) );
   4669                fp_pop();
   4670 
   4671                DIP("fstpt\n %s", dis_buf);
   4672                break;
   4673             }
   4674 
   4675             default:
   4676                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4677                vex_printf("first_opcode == 0xDB\n");
   4678                goto decode_fail;
   4679          }
   4680 
   4681       } else {
   4682 
   4683          delta++;
   4684          switch (modrm) {
   4685 
   4686             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4687                r_src = (UInt)modrm - 0xC0;
   4688                DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
   4689                put_ST_UNCHECKED(0,
   4690                                 IRExpr_Mux0X(
   4691                                     unop(Iop_1Uto8,
   4692                                          mk_x86g_calculate_condition(X86CondNB)),
   4693                                     get_ST(0), get_ST(r_src)) );
   4694                break;
   4695 
   4696             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4697                r_src = (UInt)modrm - 0xC8;
   4698                DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
   4699                put_ST_UNCHECKED(0,
   4700                                 IRExpr_Mux0X(
   4701                                     unop(Iop_1Uto8,
   4702                                          mk_x86g_calculate_condition(X86CondNZ)),
   4703                                     get_ST(0), get_ST(r_src)) );
   4704                break;
   4705 
   4706             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4707                r_src = (UInt)modrm - 0xD0;
   4708                DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
   4709                put_ST_UNCHECKED(0,
   4710                                 IRExpr_Mux0X(
   4711                                     unop(Iop_1Uto8,
   4712                                          mk_x86g_calculate_condition(X86CondNBE)),
   4713                                     get_ST(0), get_ST(r_src)) );
   4714                break;
   4715 
   4716             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4717                r_src = (UInt)modrm - 0xD8;
   4718                DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
   4719                put_ST_UNCHECKED(0,
   4720                                 IRExpr_Mux0X(
   4721                                     unop(Iop_1Uto8,
   4722                                          mk_x86g_calculate_condition(X86CondNP)),
   4723                                     get_ST(0), get_ST(r_src)) );
   4724                break;
   4725 
   4726             case 0xE2:
   4727                DIP("fnclex\n");
   4728                break;
   4729 
   4730             case 0xE3: {
   4731                /* Uses dirty helper:
   4732                      void x86g_do_FINIT ( VexGuestX86State* ) */
   4733                IRDirty* d  = unsafeIRDirty_0_N (
   4734                                 0/*regparms*/,
   4735                                 "x86g_dirtyhelper_FINIT",
   4736                                 &x86g_dirtyhelper_FINIT,
   4737                                 mkIRExprVec_0()
   4738                              );
   4739                d->needsBBP = True;
   4740 
   4741                /* declare we're writing guest state */
   4742                d->nFxState = 5;
   4743                vex_bzero(&d->fxState, sizeof(d->fxState));
   4744 
   4745                d->fxState[0].fx     = Ifx_Write;
   4746                d->fxState[0].offset = OFFB_FTOP;
   4747                d->fxState[0].size   = sizeof(UInt);
   4748 
   4749                d->fxState[1].fx     = Ifx_Write;
   4750                d->fxState[1].offset = OFFB_FPREGS;
   4751                d->fxState[1].size   = 8 * sizeof(ULong);
   4752 
   4753                d->fxState[2].fx     = Ifx_Write;
   4754                d->fxState[2].offset = OFFB_FPTAGS;
   4755                d->fxState[2].size   = 8 * sizeof(UChar);
   4756 
   4757                d->fxState[3].fx     = Ifx_Write;
   4758                d->fxState[3].offset = OFFB_FPROUND;
   4759                d->fxState[3].size   = sizeof(UInt);
   4760 
   4761                d->fxState[4].fx     = Ifx_Write;
   4762                d->fxState[4].offset = OFFB_FC3210;
   4763                d->fxState[4].size   = sizeof(UInt);
   4764 
   4765                stmt( IRStmt_Dirty(d) );
   4766 
   4767                DIP("fninit\n");
   4768                break;
   4769             }
   4770 
   4771             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4772                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4773                break;
   4774 
   4775             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4776                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4777                break;
   4778 
   4779             default:
   4780                goto decode_fail;
   4781          }
   4782       }
   4783    }
   4784 
   4785    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4786    else
   4787    if (first_opcode == 0xDC) {
   4788       if (modrm < 0xC0) {
   4789 
   4790          /* bits 5,4,3 are an opcode extension, and the modRM also
   4791             specifies an address. */
   4792          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4793          delta += len;
   4794 
   4795          switch (gregOfRM(modrm)) {
   4796 
   4797             case 0: /* FADD double-real */
   4798                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4799                break;
   4800 
   4801             case 1: /* FMUL double-real */
   4802                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4803                break;
   4804 
   4805             case 2: /* FCOM double-real */
   4806                DIP("fcoml %s\n", dis_buf);
   4807                /* This forces C1 to zero, which isn't right. */
   4808                put_C3210(
   4809                    binop( Iop_And32,
   4810                           binop(Iop_Shl32,
   4811                                 binop(Iop_CmpF64,
   4812                                       get_ST(0),
   4813                                       loadLE(Ity_F64,mkexpr(addr))),
   4814                                 mkU8(8)),
   4815                           mkU32(0x4500)
   4816                    ));
   4817                break;
   4818 
   4819             case 3: /* FCOMP double-real */
   4820                DIP("fcompl %s\n", dis_buf);
   4821                /* This forces C1 to zero, which isn't right. */
   4822                put_C3210(
   4823                    binop( Iop_And32,
   4824                           binop(Iop_Shl32,
   4825                                 binop(Iop_CmpF64,
   4826                                       get_ST(0),
   4827                                       loadLE(Ity_F64,mkexpr(addr))),
   4828                                 mkU8(8)),
   4829                           mkU32(0x4500)
   4830                    ));
   4831                fp_pop();
   4832                break;
   4833 
   4834             case 4: /* FSUB double-real */
   4835                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4836                break;
   4837 
   4838             case 5: /* FSUBR double-real */
   4839                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4840                break;
   4841 
   4842             case 6: /* FDIV double-real */
   4843                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4844                break;
   4845 
   4846             case 7: /* FDIVR double-real */
   4847                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4848                break;
   4849 
   4850             default:
   4851                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4852                vex_printf("first_opcode == 0xDC\n");
   4853                goto decode_fail;
   4854          }
   4855 
   4856       } else {
   4857 
   4858          delta++;
   4859          switch (modrm) {
   4860 
   4861             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4862                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4863                break;
   4864 
   4865             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4866                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4867                break;
   4868 
   4869             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4870                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4871                break;
   4872 
   4873             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4874                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4875                break;
   4876 
   4877             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4878                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4879                break;
   4880 
   4881             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4882                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4883                break;
   4884 
   4885             default:
   4886                goto decode_fail;
   4887          }
   4888 
   4889       }
   4890    }
   4891 
   4892    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   4893    else
   4894    if (first_opcode == 0xDD) {
   4895 
   4896       if (modrm < 0xC0) {
   4897 
   4898          /* bits 5,4,3 are an opcode extension, and the modRM also
   4899             specifies an address. */
   4900          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4901          delta += len;
   4902 
   4903          switch (gregOfRM(modrm)) {
   4904 
   4905             case 0: /* FLD double-real */
   4906                DIP("fldl %s\n", dis_buf);
   4907                fp_push();
   4908                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   4909                break;
   4910 
   4911             case 1: /* FISTTPQ m64 (SSE3) */
   4912                DIP("fistppll %s\n", dis_buf);
   4913                storeLE( mkexpr(addr),
   4914                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   4915                fp_pop();
   4916                break;
   4917 
   4918             case 2: /* FST double-real */
   4919                DIP("fstl %s\n", dis_buf);
   4920                storeLE(mkexpr(addr), get_ST(0));
   4921                break;
   4922 
   4923             case 3: /* FSTP double-real */
   4924                DIP("fstpl %s\n", dis_buf);
   4925                storeLE(mkexpr(addr), get_ST(0));
   4926                fp_pop();
   4927                break;
   4928 
   4929             case 4: { /* FRSTOR m108 */
   4930                /* Uses dirty helper:
   4931                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
   4932                IRTemp   ew = newTemp(Ity_I32);
   4933                IRDirty* d  = unsafeIRDirty_0_N (
   4934                                 0/*regparms*/,
   4935                                 "x86g_dirtyhelper_FRSTOR",
   4936                                 &x86g_dirtyhelper_FRSTOR,
   4937                                 mkIRExprVec_1( mkexpr(addr) )
   4938                              );
   4939                d->needsBBP = True;
   4940                d->tmp      = ew;
   4941                /* declare we're reading memory */
   4942                d->mFx   = Ifx_Read;
   4943                d->mAddr = mkexpr(addr);
   4944                d->mSize = 108;
   4945 
   4946                /* declare we're writing guest state */
   4947                d->nFxState = 5;
   4948                vex_bzero(&d->fxState, sizeof(d->fxState));
   4949 
   4950                d->fxState[0].fx     = Ifx_Write;
   4951                d->fxState[0].offset = OFFB_FTOP;
   4952                d->fxState[0].size   = sizeof(UInt);
   4953 
   4954                d->fxState[1].fx     = Ifx_Write;
   4955                d->fxState[1].offset = OFFB_FPREGS;
   4956                d->fxState[1].size   = 8 * sizeof(ULong);
   4957 
   4958                d->fxState[2].fx     = Ifx_Write;
   4959                d->fxState[2].offset = OFFB_FPTAGS;
   4960                d->fxState[2].size   = 8 * sizeof(UChar);
   4961 
   4962                d->fxState[3].fx     = Ifx_Write;
   4963                d->fxState[3].offset = OFFB_FPROUND;
   4964                d->fxState[3].size   = sizeof(UInt);
   4965 
   4966                d->fxState[4].fx     = Ifx_Write;
   4967                d->fxState[4].offset = OFFB_FC3210;
   4968                d->fxState[4].size   = sizeof(UInt);
   4969 
   4970                stmt( IRStmt_Dirty(d) );
   4971 
   4972                /* ew contains any emulation warning we may need to
   4973                   issue.  If needed, side-exit to the next insn,
   4974                   reporting the warning, so that Valgrind's dispatcher
   4975                   sees the warning. */
   4976                put_emwarn( mkexpr(ew) );
   4977                stmt(
   4978                   IRStmt_Exit(
   4979                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4980                      Ijk_EmWarn,
   4981                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   4982                      OFFB_EIP
   4983                   )
   4984                );
   4985 
   4986                DIP("frstor %s\n", dis_buf);
   4987                break;
   4988             }
   4989 
   4990             case 6: { /* FNSAVE m108 */
   4991                /* Uses dirty helper:
   4992                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
   4993                IRDirty* d = unsafeIRDirty_0_N (
   4994                                0/*regparms*/,
   4995                                "x86g_dirtyhelper_FSAVE",
   4996                                &x86g_dirtyhelper_FSAVE,
   4997                                mkIRExprVec_1( mkexpr(addr) )
   4998                             );
   4999                d->needsBBP = True;
   5000                /* declare we're writing memory */
   5001                d->mFx   = Ifx_Write;
   5002                d->mAddr = mkexpr(addr);
   5003                d->mSize = 108;
   5004 
   5005                /* declare we're reading guest state */
   5006                d->nFxState = 5;
   5007                vex_bzero(&d->fxState, sizeof(d->fxState));
   5008 
   5009                d->fxState[0].fx     = Ifx_Read;
   5010                d->fxState[0].offset = OFFB_FTOP;
   5011                d->fxState[0].size   = sizeof(UInt);
   5012 
   5013                d->fxState[1].fx     = Ifx_Read;
   5014                d->fxState[1].offset = OFFB_FPREGS;
   5015                d->fxState[1].size   = 8 * sizeof(ULong);
   5016 
   5017                d->fxState[2].fx     = Ifx_Read;
   5018                d->fxState[2].offset = OFFB_FPTAGS;
   5019                d->fxState[2].size   = 8 * sizeof(UChar);
   5020 
   5021                d->fxState[3].fx     = Ifx_Read;
   5022                d->fxState[3].offset = OFFB_FPROUND;
   5023                d->fxState[3].size   = sizeof(UInt);
   5024 
   5025                d->fxState[4].fx     = Ifx_Read;
   5026                d->fxState[4].offset = OFFB_FC3210;
   5027                d->fxState[4].size   = sizeof(UInt);
   5028 
   5029                stmt( IRStmt_Dirty(d) );
   5030 
   5031                DIP("fnsave %s\n", dis_buf);
   5032                break;
   5033             }
   5034 
   5035             case 7: { /* FNSTSW m16 */
   5036                IRExpr* sw = get_FPU_sw();
   5037                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5038                storeLE( mkexpr(addr), sw );
   5039                DIP("fnstsw %s\n", dis_buf);
   5040                break;
   5041             }
   5042 
   5043             default:
   5044                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5045                vex_printf("first_opcode == 0xDD\n");
   5046                goto decode_fail;
   5047          }
   5048       } else {
   5049          delta++;
   5050          switch (modrm) {
   5051 
   5052             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5053                r_dst = (UInt)modrm - 0xC0;
   5054                DIP("ffree %%st(%d)\n", (Int)r_dst);
   5055                put_ST_TAG ( r_dst, mkU8(0) );
   5056                break;
   5057 
   5058             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5059                r_dst = (UInt)modrm - 0xD0;
   5060                DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
   5061                /* P4 manual says: "If the destination operand is a
   5062                   non-empty register, the invalid-operation exception
   5063                   is not generated."  Hence put_ST_UNCHECKED. */
   5064                put_ST_UNCHECKED(r_dst, get_ST(0));
   5065                break;
   5066 
   5067             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5068                r_dst = (UInt)modrm - 0xD8;
   5069                DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
   5070                /* P4 manual says: "If the destination operand is a
   5071                   non-empty register, the invalid-operation exception
   5072                   is not generated."  Hence put_ST_UNCHECKED. */
   5073                put_ST_UNCHECKED(r_dst, get_ST(0));
   5074                fp_pop();
   5075                break;
   5076 
   5077             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5078                r_dst = (UInt)modrm - 0xE0;
   5079                DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
   5080                /* This forces C1 to zero, which isn't right. */
   5081                put_C3210(
   5082                    binop( Iop_And32,
   5083                           binop(Iop_Shl32,
   5084                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5085                                 mkU8(8)),
   5086                           mkU32(0x4500)
   5087                    ));
   5088                break;
   5089 
   5090             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5091                r_dst = (UInt)modrm - 0xE8;
   5092                DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
   5093                /* This forces C1 to zero, which isn't right. */
   5094                put_C3210(
   5095                    binop( Iop_And32,
   5096                           binop(Iop_Shl32,
   5097                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5098                                 mkU8(8)),
   5099                           mkU32(0x4500)
   5100                    ));
   5101                fp_pop();
   5102                break;
   5103 
   5104             default:
   5105                goto decode_fail;
   5106          }
   5107       }
   5108    }
   5109 
   5110    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5111    else
   5112    if (first_opcode == 0xDE) {
   5113 
   5114       if (modrm < 0xC0) {
   5115 
   5116          /* bits 5,4,3 are an opcode extension, and the modRM also
   5117             specifies an address. */
   5118          IROp   fop;
   5119          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5120          delta += len;
   5121 
   5122          switch (gregOfRM(modrm)) {
   5123 
   5124             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5125                DIP("fiaddw %s\n", dis_buf);
   5126                fop = Iop_AddF64;
   5127                goto do_fop_m16;
   5128 
   5129             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5130                DIP("fimulw %s\n", dis_buf);
   5131                fop = Iop_MulF64;
   5132                goto do_fop_m16;
   5133 
   5134             case 2: /* FICOM m16int */
   5135                DIP("ficomw %s\n", dis_buf);
   5136                /* This forces C1 to zero, which isn't right. */
   5137                put_C3210(
   5138                    binop( Iop_And32,
   5139                           binop(Iop_Shl32,
   5140                                 binop(Iop_CmpF64,
   5141                                       get_ST(0),
   5142                                       unop(Iop_I32StoF64,
   5143                                          unop(Iop_16Sto32,
   5144                                            loadLE(Ity_I16,mkexpr(addr))))),
   5145                                 mkU8(8)),
   5146                           mkU32(0x4500)
   5147                    ));
   5148                break;
   5149 
   5150             case 3: /* FICOMP m16int */
   5151                DIP("ficompw %s\n", dis_buf);
   5152                /* This forces C1 to zero, which isn't right. */
   5153                put_C3210(
   5154                    binop( Iop_And32,
   5155                           binop(Iop_Shl32,
   5156                                 binop(Iop_CmpF64,
   5157                                       get_ST(0),
   5158                                       unop(Iop_I32StoF64,
   5159                                          unop(Iop_16Sto32,
   5160                                               loadLE(Ity_I16,mkexpr(addr))))),
   5161                                 mkU8(8)),
   5162                           mkU32(0x4500)
   5163                    ));
   5164                fp_pop();
   5165                break;
   5166 
   5167             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5168                DIP("fisubw %s\n", dis_buf);
   5169                fop = Iop_SubF64;
   5170                goto do_fop_m16;
   5171 
   5172             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5173                DIP("fisubrw %s\n", dis_buf);
   5174                fop = Iop_SubF64;
   5175                goto do_foprev_m16;
   5176 
   5177             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5178                DIP("fisubw %s\n", dis_buf);
   5179                fop = Iop_DivF64;
   5180                goto do_fop_m16;
   5181 
   5182             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5183                DIP("fidivrw %s\n", dis_buf);
   5184                fop = Iop_DivF64;
   5185                goto do_foprev_m16;
   5186 
   5187             do_fop_m16:
   5188                put_ST_UNCHECKED(0,
   5189                   triop(fop,
   5190                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5191                         get_ST(0),
   5192                         unop(Iop_I32StoF64,
   5193                              unop(Iop_16Sto32,
   5194                                   loadLE(Ity_I16, mkexpr(addr))))));
   5195                break;
   5196 
   5197             do_foprev_m16:
   5198                put_ST_UNCHECKED(0,
   5199                   triop(fop,
   5200                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5201                         unop(Iop_I32StoF64,
   5202                              unop(Iop_16Sto32,
   5203                                   loadLE(Ity_I16, mkexpr(addr)))),
   5204                         get_ST(0)));
   5205                break;
   5206 
   5207             default:
   5208                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5209                vex_printf("first_opcode == 0xDE\n");
   5210                goto decode_fail;
   5211          }
   5212 
   5213       } else {
   5214 
   5215          delta++;
   5216          switch (modrm) {
   5217 
   5218             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5219                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5220                break;
   5221 
   5222             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5223                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5224                break;
   5225 
   5226             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5227                DIP("fuompp %%st(0),%%st(1)\n");
   5228                /* This forces C1 to zero, which isn't right. */
   5229                put_C3210(
   5230                    binop( Iop_And32,
   5231                           binop(Iop_Shl32,
   5232                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5233                                 mkU8(8)),
   5234                           mkU32(0x4500)
   5235                    ));
   5236                fp_pop();
   5237                fp_pop();
   5238                break;
   5239 
   5240             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5241                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5242                break;
   5243 
   5244             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5245                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5246                break;
   5247 
   5248             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5249                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5250                break;
   5251 
   5252             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5253                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5254                break;
   5255 
   5256             default:
   5257                goto decode_fail;
   5258          }
   5259 
   5260       }
   5261    }
   5262 
   5263    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5264    else
   5265    if (first_opcode == 0xDF) {
   5266 
   5267       if (modrm < 0xC0) {
   5268 
   5269          /* bits 5,4,3 are an opcode extension, and the modRM also
   5270             specifies an address. */
   5271          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5272          delta += len;
   5273 
   5274          switch (gregOfRM(modrm)) {
   5275 
   5276             case 0: /* FILD m16int */
   5277                DIP("fildw %s\n", dis_buf);
   5278                fp_push();
   5279                put_ST(0, unop(Iop_I32StoF64,
   5280                               unop(Iop_16Sto32,
   5281                                    loadLE(Ity_I16, mkexpr(addr)))));
   5282                break;
   5283 
   5284             case 1: /* FISTTPS m16 (SSE3) */
   5285                DIP("fisttps %s\n", dis_buf);
   5286                storeLE( mkexpr(addr),
   5287                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5288                fp_pop();
   5289                break;
   5290 
   5291             case 2: /* FIST m16 */
   5292                DIP("fistp %s\n", dis_buf);
   5293                storeLE( mkexpr(addr),
   5294                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5295                break;
   5296 
   5297             case 3: /* FISTP m16 */
   5298                DIP("fistps %s\n", dis_buf);
   5299                storeLE( mkexpr(addr),
   5300                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5301                fp_pop();
   5302                break;
   5303 
   5304             case 5: /* FILD m64 */
   5305                DIP("fildll %s\n", dis_buf);
   5306                fp_push();
   5307                put_ST(0, binop(Iop_I64StoF64,
   5308                                get_roundingmode(),
   5309                                loadLE(Ity_I64, mkexpr(addr))));
   5310                break;
   5311 
   5312             case 7: /* FISTP m64 */
   5313                DIP("fistpll %s\n", dis_buf);
   5314                storeLE( mkexpr(addr),
   5315                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5316                fp_pop();
   5317                break;
   5318 
   5319             default:
   5320                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5321                vex_printf("first_opcode == 0xDF\n");
   5322                goto decode_fail;
   5323          }
   5324 
   5325       } else {
   5326 
   5327          delta++;
   5328          switch (modrm) {
   5329 
   5330             case 0xC0: /* FFREEP %st(0) */
   5331                DIP("ffreep %%st(%d)\n", 0);
   5332                put_ST_TAG ( 0, mkU8(0) );
   5333                fp_pop();
   5334                break;
   5335 
   5336             case 0xE0: /* FNSTSW %ax */
   5337                DIP("fnstsw %%ax\n");
   5338                /* Get the FPU status word value and dump it in %AX. */
   5339                if (0) {
   5340                   /* The obvious thing to do is simply dump the 16-bit
   5341                      status word value in %AX.  However, due to a
   5342                      limitation in Memcheck's origin tracking
   5343                      machinery, this causes Memcheck not to track the
   5344                      origin of any undefinedness into %AH (only into
   5345                      %AL/%AX/%EAX), which means origins are lost in
   5346                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5347                   putIReg(2, R_EAX, get_FPU_sw());
   5348                } else {
   5349                   /* So a somewhat lame kludge is to make it very
   5350                      clear to Memcheck that the value is written to
   5351                      both %AH and %AL.  This generates marginally
   5352                      worse code, but I don't think it matters much. */
   5353                   IRTemp t16 = newTemp(Ity_I16);
   5354                   assign(t16, get_FPU_sw());
   5355                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5356                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5357                }
   5358                break;
   5359 
   5360             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5361                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5362                break;
   5363 
   5364             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5365                /* not really right since COMIP != UCOMIP */
   5366                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5367                break;
   5368 
   5369             default:
   5370                goto decode_fail;
   5371          }
   5372       }
   5373 
   5374    }
   5375 
   5376    else
   5377    vpanic("dis_FPU(x86): invalid primary opcode");
   5378 
   5379    *decode_ok = True;
   5380    return delta;
   5381 
   5382   decode_fail:
   5383    *decode_ok = False;
   5384    return delta;
   5385 }
   5386 
   5387 
   5388 /*------------------------------------------------------------*/
   5389 /*---                                                      ---*/
   5390 /*--- MMX INSTRUCTIONS                                     ---*/
   5391 /*---                                                      ---*/
   5392 /*------------------------------------------------------------*/
   5393 
   5394 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5395    IA32 arch manual, volume 3):
   5396 
   5397    Read from, or write to MMX register (viz, any insn except EMMS):
   5398    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5399    * FP stack pointer set to zero
   5400 
   5401    EMMS:
   5402    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5403    * FP stack pointer set to zero
   5404 */
   5405 
   5406 static void do_MMX_preamble ( void )
   5407 {
   5408    Int         i;
   5409    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5410    IRExpr*     zero  = mkU32(0);
   5411    IRExpr*     tag1  = mkU8(1);
   5412    put_ftop(zero);
   5413    for (i = 0; i < 8; i++)
   5414       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   5415 }
   5416 
   5417 static void do_EMMS_preamble ( void )
   5418 {
   5419    Int         i;
   5420    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5421    IRExpr*     zero  = mkU32(0);
   5422    IRExpr*     tag0  = mkU8(0);
   5423    put_ftop(zero);
   5424    for (i = 0; i < 8; i++)
   5425       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   5426 }
   5427 
   5428 
   5429 static IRExpr* getMMXReg ( UInt archreg )
   5430 {
   5431    vassert(archreg < 8);
   5432    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5433 }
   5434 
   5435 
   5436 static void putMMXReg ( UInt archreg, IRExpr* e )
   5437 {
   5438    vassert(archreg < 8);
   5439    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5440    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5441 }
   5442 
   5443 
   5444 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5445    sense that it does not first call do_MMX_preamble() -- that is the
   5446    responsibility of its caller. */
   5447 
   5448 static
   5449 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5450                                Int    delta,
   5451                                UChar  opc,
   5452                                HChar* name,
   5453                                Bool   show_granularity )
   5454 {
   5455    HChar   dis_buf[50];
   5456    UChar   modrm = getIByte(delta);
   5457    Bool    isReg = epartIsReg(modrm);
   5458    IRExpr* argL  = NULL;
   5459    IRExpr* argR  = NULL;
   5460    IRExpr* argG  = NULL;
   5461    IRExpr* argE  = NULL;
   5462    IRTemp  res   = newTemp(Ity_I64);
   5463 
   5464    Bool    invG  = False;
   5465    IROp    op    = Iop_INVALID;
   5466    void*   hAddr = NULL;
   5467    HChar*  hName = NULL;
   5468    Bool    eLeft = False;
   5469 
   5470 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5471 
   5472    switch (opc) {
   5473       /* Original MMX ones */
   5474       case 0xFC: op = Iop_Add8x8; break;
   5475       case 0xFD: op = Iop_Add16x4; break;
   5476       case 0xFE: op = Iop_Add32x2; break;
   5477 
   5478       case 0xEC: op = Iop_QAdd8Sx8; break;
   5479       case 0xED: op = Iop_QAdd16Sx4; break;
   5480 
   5481       case 0xDC: op = Iop_QAdd8Ux8; break;
   5482       case 0xDD: op = Iop_QAdd16Ux4; break;
   5483 
   5484       case 0xF8: op = Iop_Sub8x8;  break;
   5485       case 0xF9: op = Iop_Sub16x4; break;
   5486       case 0xFA: op = Iop_Sub32x2; break;
   5487 
   5488       case 0xE8: op = Iop_QSub8Sx8; break;
   5489       case 0xE9: op = Iop_QSub16Sx4; break;
   5490 
   5491       case 0xD8: op = Iop_QSub8Ux8; break;
   5492       case 0xD9: op = Iop_QSub16Ux4; break;
   5493 
   5494       case 0xE5: op = Iop_MulHi16Sx4; break;
   5495       case 0xD5: op = Iop_Mul16x4; break;
   5496       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5497 
   5498       case 0x74: op = Iop_CmpEQ8x8; break;
   5499       case 0x75: op = Iop_CmpEQ16x4; break;
   5500       case 0x76: op = Iop_CmpEQ32x2; break;
   5501 
   5502       case 0x64: op = Iop_CmpGT8Sx8; break;
   5503       case 0x65: op = Iop_CmpGT16Sx4; break;
   5504       case 0x66: op = Iop_CmpGT32Sx2; break;
   5505 
   5506       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   5507       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   5508       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   5509 
   5510       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5511       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5512       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5513 
   5514       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5515       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5516       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5517 
   5518       case 0xDB: op = Iop_And64; break;
   5519       case 0xDF: op = Iop_And64; invG = True; break;
   5520       case 0xEB: op = Iop_Or64; break;
   5521       case 0xEF: /* Possibly do better here if argL and argR are the
   5522                     same reg */
   5523                  op = Iop_Xor64; break;
   5524 
   5525       /* Introduced in SSE1 */
   5526       case 0xE0: op = Iop_Avg8Ux8;    break;
   5527       case 0xE3: op = Iop_Avg16Ux4;   break;
   5528       case 0xEE: op = Iop_Max16Sx4;   break;
   5529       case 0xDE: op = Iop_Max8Ux8;    break;
   5530       case 0xEA: op = Iop_Min16Sx4;   break;
   5531       case 0xDA: op = Iop_Min8Ux8;    break;
   5532       case 0xE4: op = Iop_MulHi16Ux4; break;
   5533       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5534 
   5535       /* Introduced in SSE2 */
   5536       case 0xD4: op = Iop_Add64; break;
   5537       case 0xFB: op = Iop_Sub64; break;
   5538 
   5539       default:
   5540          vex_printf("\n0x%x\n", (Int)opc);
   5541          vpanic("dis_MMXop_regmem_to_reg");
   5542    }
   5543 
   5544 #  undef XXX
   5545 
   5546    argG = getMMXReg(gregOfRM(modrm));
   5547    if (invG)
   5548       argG = unop(Iop_Not64, argG);
   5549 
   5550    if (isReg) {
   5551       delta++;
   5552       argE = getMMXReg(eregOfRM(modrm));
   5553    } else {
   5554       Int    len;
   5555       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5556       delta += len;
   5557       argE = loadLE(Ity_I64, mkexpr(addr));
   5558    }
   5559 
   5560    if (eLeft) {
   5561       argL = argE;
   5562       argR = argG;
   5563    } else {
   5564       argL = argG;
   5565       argR = argE;
   5566    }
   5567 
   5568    if (op != Iop_INVALID) {
   5569       vassert(hName == NULL);
   5570       vassert(hAddr == NULL);
   5571       assign(res, binop(op, argL, argR));
   5572    } else {
   5573       vassert(hName != NULL);
   5574       vassert(hAddr != NULL);
   5575       assign( res,
   5576               mkIRExprCCall(
   5577                  Ity_I64,
   5578                  0/*regparms*/, hName, hAddr,
   5579                  mkIRExprVec_2( argL, argR )
   5580               )
   5581             );
   5582    }
   5583 
   5584    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5585 
   5586    DIP("%s%s %s, %s\n",
   5587        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5588        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5589        nameMMXReg(gregOfRM(modrm)) );
   5590 
   5591    return delta;
   5592 }
   5593 
   5594 
   5595 /* Vector by scalar shift of G by the amount specified at the bottom
   5596    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5597 
   5598 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5599                                  HChar* opname, IROp op )
   5600 {
   5601    HChar   dis_buf[50];
   5602    Int     alen, size;
   5603    IRTemp  addr;
   5604    Bool    shl, shr, sar;
   5605    UChar   rm   = getIByte(delta);
   5606    IRTemp  g0   = newTemp(Ity_I64);
   5607    IRTemp  g1   = newTemp(Ity_I64);
   5608    IRTemp  amt  = newTemp(Ity_I32);
   5609    IRTemp  amt8 = newTemp(Ity_I8);
   5610 
   5611    if (epartIsReg(rm)) {
   5612       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5613       DIP("%s %s,%s\n", opname,
   5614                         nameMMXReg(eregOfRM(rm)),
   5615                         nameMMXReg(gregOfRM(rm)) );
   5616       delta++;
   5617    } else {
   5618       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5619       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5620       DIP("%s %s,%s\n", opname,
   5621                         dis_buf,
   5622                         nameMMXReg(gregOfRM(rm)) );
   5623       delta += alen;
   5624    }
   5625    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5626    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5627 
   5628    shl = shr = sar = False;
   5629    size = 0;
   5630    switch (op) {
   5631       case Iop_ShlN16x4: shl = True; size = 32; break;
   5632       case Iop_ShlN32x2: shl = True; size = 32; break;
   5633       case Iop_Shl64:    shl = True; size = 64; break;
   5634       case Iop_ShrN16x4: shr = True; size = 16; break;
   5635       case Iop_ShrN32x2: shr = True; size = 32; break;
   5636       case Iop_Shr64:    shr = True; size = 64; break;
   5637       case Iop_SarN16x4: sar = True; size = 16; break;
   5638       case Iop_SarN32x2: sar = True; size = 32; break;
   5639       default: vassert(0);
   5640    }
   5641 
   5642    if (shl || shr) {
   5643      assign(
   5644         g1,
   5645         IRExpr_Mux0X(
   5646            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5647            mkU64(0),
   5648            binop(op, mkexpr(g0), mkexpr(amt8))
   5649         )
   5650      );
   5651    } else
   5652    if (sar) {
   5653      assign(
   5654         g1,
   5655         IRExpr_Mux0X(
   5656            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5657            binop(op, mkexpr(g0), mkU8(size-1)),
   5658            binop(op, mkexpr(g0), mkexpr(amt8))
   5659         )
   5660      );
   5661    } else {
   5662       /*NOTREACHED*/
   5663       vassert(0);
   5664    }
   5665 
   5666    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5667    return delta;
   5668 }
   5669 
   5670 
   5671 /* Vector by scalar shift of E by an immediate byte.  This is a
   5672    straight copy of dis_SSE_shiftE_imm. */
   5673 
   5674 static
   5675 UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
   5676 {
   5677    Bool    shl, shr, sar;
   5678    UChar   rm   = getIByte(delta);
   5679    IRTemp  e0   = newTemp(Ity_I64);
   5680    IRTemp  e1   = newTemp(Ity_I64);
   5681    UChar   amt, size;
   5682    vassert(epartIsReg(rm));
   5683    vassert(gregOfRM(rm) == 2
   5684            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5685    amt = getIByte(delta+1);
   5686    delta += 2;
   5687    DIP("%s $%d,%s\n", opname,
   5688                       (Int)amt,
   5689                       nameMMXReg(eregOfRM(rm)) );
   5690 
   5691    assign( e0, getMMXReg(eregOfRM(rm)) );
   5692 
   5693    shl = shr = sar = False;
   5694    size = 0;
   5695    switch (op) {
   5696       case Iop_ShlN16x4: shl = True; size = 16; break;
   5697       case Iop_ShlN32x2: shl = True; size = 32; break;
   5698       case Iop_Shl64:    shl = True; size = 64; break;
   5699       case Iop_SarN16x4: sar = True; size = 16; break;
   5700       case Iop_SarN32x2: sar = True; size = 32; break;
   5701       case Iop_ShrN16x4: shr = True; size = 16; break;
   5702       case Iop_ShrN32x2: shr = True; size = 32; break;
   5703       case Iop_Shr64:    shr = True; size = 64; break;
   5704       default: vassert(0);
   5705    }
   5706 
   5707    if (shl || shr) {
   5708       assign( e1, amt >= size
   5709                      ? mkU64(0)
   5710                      : binop(op, mkexpr(e0), mkU8(amt))
   5711       );
   5712    } else
   5713    if (sar) {
   5714       assign( e1, amt >= size
   5715                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5716                      : binop(op, mkexpr(e0), mkU8(amt))
   5717       );
   5718    } else {
   5719       /*NOTREACHED*/
   5720       vassert(0);
   5721    }
   5722 
   5723    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5724    return delta;
   5725 }
   5726 
   5727 
   5728 /* Completely handle all MMX instructions except emms. */
   5729 
   5730 static
   5731 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5732 {
   5733    Int   len;
   5734    UChar modrm;
   5735    HChar dis_buf[50];
   5736    UChar opc = getIByte(delta);
   5737    delta++;
   5738 
   5739    /* dis_MMX handles all insns except emms. */
   5740    do_MMX_preamble();
   5741 
   5742    switch (opc) {
   5743 
   5744       case 0x6E:
   5745          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5746          if (sz != 4)
   5747             goto mmx_decode_failure;
   5748          modrm = getIByte(delta);
   5749          if (epartIsReg(modrm)) {
   5750             delta++;
   5751             putMMXReg(
   5752                gregOfRM(modrm),
   5753                binop( Iop_32HLto64,
   5754                       mkU32(0),
   5755                       getIReg(4, eregOfRM(modrm)) ) );
   5756             DIP("movd %s, %s\n",
   5757                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5758          } else {
   5759             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5760             delta += len;
   5761             putMMXReg(
   5762                gregOfRM(modrm),
   5763                binop( Iop_32HLto64,
   5764                       mkU32(0),
   5765                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5766             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5767          }
   5768          break;
   5769 
   5770       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5771          if (sz != 4)
   5772             goto mmx_decode_failure;
   5773          modrm = getIByte(delta);
   5774          if (epartIsReg(modrm)) {
   5775             delta++;
   5776             putIReg( 4, eregOfRM(modrm),
   5777                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5778             DIP("movd %s, %s\n",
   5779                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5780          } else {
   5781             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5782             delta += len;
   5783             storeLE( mkexpr(addr),
   5784                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5785             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5786          }
   5787          break;
   5788 
   5789       case 0x6F:
   5790          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5791          if (sz != 4)
   5792             goto mmx_decode_failure;
   5793          modrm = getIByte(delta);
   5794          if (epartIsReg(modrm)) {
   5795             delta++;
   5796             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5797             DIP("movq %s, %s\n",
   5798                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5799          } else {
   5800             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5801             delta += len;
   5802             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5803             DIP("movq %s, %s\n",
   5804                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5805          }
   5806          break;
   5807 
   5808       case 0x7F:
   5809          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5810          if (sz != 4)
   5811             goto mmx_decode_failure;
   5812          modrm = getIByte(delta);
   5813          if (epartIsReg(modrm)) {
   5814             delta++;
   5815             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5816             DIP("movq %s, %s\n",
   5817                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5818          } else {
   5819             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5820             delta += len;
   5821             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5822             DIP("mov(nt)q %s, %s\n",
   5823                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5824          }
   5825          break;
   5826 
   5827       case 0xFC:
   5828       case 0xFD:
   5829       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5830          if (sz != 4)
   5831             goto mmx_decode_failure;
   5832          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5833          break;
   5834 
   5835       case 0xEC:
   5836       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5837          if (sz != 4)
   5838             goto mmx_decode_failure;
   5839          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5840          break;
   5841 
   5842       case 0xDC:
   5843       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5844          if (sz != 4)
   5845             goto mmx_decode_failure;
   5846          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5847          break;
   5848 
   5849       case 0xF8:
   5850       case 0xF9:
   5851       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5852          if (sz != 4)
   5853             goto mmx_decode_failure;
   5854          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5855          break;
   5856 
   5857       case 0xE8:
   5858       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5859          if (sz != 4)
   5860             goto mmx_decode_failure;
   5861          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5862          break;
   5863 
   5864       case 0xD8:
   5865       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5866          if (sz != 4)
   5867             goto mmx_decode_failure;
   5868          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5869          break;
   5870 
   5871       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5872          if (sz != 4)
   5873             goto mmx_decode_failure;
   5874          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5875          break;
   5876 
   5877       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5878          if (sz != 4)
   5879             goto mmx_decode_failure;
   5880          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5881          break;
   5882 
   5883       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5884          vassert(sz == 4);
   5885          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5886          break;
   5887 
   5888       case 0x74:
   5889       case 0x75:
   5890       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5891          if (sz != 4)
   5892             goto mmx_decode_failure;
   5893          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   5894          break;
   5895 
   5896       case 0x64:
   5897       case 0x65:
   5898       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   5899          if (sz != 4)
   5900             goto mmx_decode_failure;
   5901          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   5902          break;
   5903 
   5904       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   5905          if (sz != 4)
   5906             goto mmx_decode_failure;
   5907          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   5908          break;
   5909 
   5910       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5911          if (sz != 4)
   5912             goto mmx_decode_failure;
   5913          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   5914          break;
   5915 
   5916       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5917          if (sz != 4)
   5918             goto mmx_decode_failure;
   5919          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   5920          break;
   5921 
   5922       case 0x68:
   5923       case 0x69:
   5924       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   5925          if (sz != 4)
   5926             goto mmx_decode_failure;
   5927          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   5928          break;
   5929 
   5930       case 0x60:
   5931       case 0x61:
   5932       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5933          if (sz != 4)
   5934             goto mmx_decode_failure;
   5935          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   5936          break;
   5937 
   5938       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   5939          if (sz != 4)
   5940             goto mmx_decode_failure;
   5941          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   5942          break;
   5943 
   5944       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   5945          if (sz != 4)
   5946             goto mmx_decode_failure;
   5947          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   5948          break;
   5949 
   5950       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   5951          if (sz != 4)
   5952             goto mmx_decode_failure;
   5953          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   5954          break;
   5955 
   5956       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   5957          if (sz != 4)
   5958             goto mmx_decode_failure;
   5959          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   5960          break;
   5961 
   5962 #     define SHIFT_BY_REG(_name,_op)                                 \
   5963                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   5964                 break;
   5965 
   5966       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5967       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   5968       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   5969       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   5970 
   5971       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5972       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   5973       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   5974       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   5975 
   5976       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   5977       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   5978       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   5979 
   5980 #     undef SHIFT_BY_REG
   5981 
   5982       case 0x71:
   5983       case 0x72:
   5984       case 0x73: {
   5985          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   5986          UChar byte2, subopc;
   5987          if (sz != 4)
   5988             goto mmx_decode_failure;
   5989          byte2  = getIByte(delta);           /* amode / sub-opcode */
   5990          subopc = toUChar( (byte2 >> 3) & 7 );
   5991 
   5992 #        define SHIFT_BY_IMM(_name,_op)                         \
   5993              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   5994              } while (0)
   5995 
   5996               if (subopc == 2 /*SRL*/ && opc == 0x71)
   5997                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   5998          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   5999                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   6000          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   6001                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   6002 
   6003          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   6004                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   6005          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   6006                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   6007 
   6008          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   6009                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   6010          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   6011                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   6012          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   6013                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   6014 
   6015          else goto mmx_decode_failure;
   6016 
   6017 #        undef SHIFT_BY_IMM
   6018          break;
   6019       }
   6020 
   6021       case 0xF7: {
   6022          IRTemp addr    = newTemp(Ity_I32);
   6023          IRTemp regD    = newTemp(Ity_I64);
   6024          IRTemp regM    = newTemp(Ity_I64);
   6025          IRTemp mask    = newTemp(Ity_I64);
   6026          IRTemp olddata = newTemp(Ity_I64);
   6027          IRTemp newdata = newTemp(Ity_I64);
   6028 
   6029          modrm = getIByte(delta);
   6030          if (sz != 4 || (!epartIsReg(modrm)))
   6031             goto mmx_decode_failure;
   6032          delta++;
   6033 
   6034          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6035          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6036          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6037          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6038          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6039          assign( newdata,
   6040                  binop(Iop_Or64,
   6041                        binop(Iop_And64,
   6042                              mkexpr(regD),
   6043                              mkexpr(mask) ),
   6044                        binop(Iop_And64,
   6045                              mkexpr(olddata),
   6046                              unop(Iop_Not64, mkexpr(mask)))) );
   6047          storeLE( mkexpr(addr), mkexpr(newdata) );
   6048          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6049                                  nameMMXReg( gregOfRM(modrm) ) );
   6050          break;
   6051       }
   6052 
   6053       /* --- MMX decode failure --- */
   6054       default:
   6055       mmx_decode_failure:
   6056          *decode_ok = False;
   6057          return delta; /* ignored */
   6058 
   6059    }
   6060 
   6061    *decode_ok = True;
   6062    return delta;
   6063 }
   6064 
   6065 
   6066 /*------------------------------------------------------------*/
   6067 /*--- More misc arithmetic and other obscure insns.        ---*/
   6068 /*------------------------------------------------------------*/
   6069 
   6070 /* Double length left and right shifts.  Apparently only required in
   6071    v-size (no b- variant). */
   6072 static
   6073 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6074                        Int delta, UChar modrm,
   6075                        Int sz,
   6076                        IRExpr* shift_amt,
   6077                        Bool amt_is_literal,
   6078                        HChar* shift_amt_txt,
   6079                        Bool left_shift )
   6080 {
   6081    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6082       for printing it.   And eip on entry points at the modrm byte. */
   6083    Int len;
   6084    HChar dis_buf[50];
   6085 
   6086    IRType ty       = szToITy(sz);
   6087    IRTemp gsrc     = newTemp(ty);
   6088    IRTemp esrc     = newTemp(ty);
   6089    IRTemp addr     = IRTemp_INVALID;
   6090    IRTemp tmpSH    = newTemp(Ity_I8);
   6091    IRTemp tmpL     = IRTemp_INVALID;
   6092    IRTemp tmpRes   = IRTemp_INVALID;
   6093    IRTemp tmpSubSh = IRTemp_INVALID;
   6094    IROp   mkpair;
   6095    IROp   getres;
   6096    IROp   shift;
   6097    IRExpr* mask = NULL;
   6098 
   6099    vassert(sz == 2 || sz == 4);
   6100 
   6101    /* The E-part is the destination; this is shifted.  The G-part
   6102       supplies bits to be shifted into the E-part, but is not
   6103       changed.
   6104 
   6105       If shifting left, form a double-length word with E at the top
   6106       and G at the bottom, and shift this left.  The result is then in
   6107       the high part.
   6108 
   6109       If shifting right, form a double-length word with G at the top
   6110       and E at the bottom, and shift this right.  The result is then
   6111       at the bottom.  */
   6112 
   6113    /* Fetch the operands. */
   6114 
   6115    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6116 
   6117    if (epartIsReg(modrm)) {
   6118       delta++;
   6119       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6120       DIP("sh%cd%c %s, %s, %s\n",
   6121           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6122           shift_amt_txt,
   6123           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6124    } else {
   6125       addr = disAMode ( &len, sorb, delta, dis_buf );
   6126       delta += len;
   6127       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6128       DIP("sh%cd%c %s, %s, %s\n",
   6129           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6130           shift_amt_txt,
   6131           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6132    }
   6133 
   6134    /* Round up the relevant primops. */
   6135 
   6136    if (sz == 4) {
   6137       tmpL     = newTemp(Ity_I64);
   6138       tmpRes   = newTemp(Ity_I32);
   6139       tmpSubSh = newTemp(Ity_I32);
   6140       mkpair   = Iop_32HLto64;
   6141       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6142       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6143       mask     = mkU8(31);
   6144    } else {
   6145       /* sz == 2 */
   6146       tmpL     = newTemp(Ity_I32);
   6147       tmpRes   = newTemp(Ity_I16);
   6148       tmpSubSh = newTemp(Ity_I16);
   6149       mkpair   = Iop_16HLto32;
   6150       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6151       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6152       mask     = mkU8(15);
   6153    }
   6154 
   6155    /* Do the shift, calculate the subshift value, and set
   6156       the flag thunk. */
   6157 
   6158    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6159 
   6160    if (left_shift)
   6161       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6162    else
   6163       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6164 
   6165    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   6166    assign( tmpSubSh,
   6167            unop(getres,
   6168                 binop(shift,
   6169                       mkexpr(tmpL),
   6170                       binop(Iop_And8,
   6171                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6172                             mask))) );
   6173 
   6174    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6175                               tmpRes, tmpSubSh, ty, tmpSH );
   6176 
   6177    /* Put result back. */
   6178 
   6179    if (epartIsReg(modrm)) {
   6180       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6181    } else {
   6182       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6183    }
   6184 
   6185    if (amt_is_literal) delta++;
   6186    return delta;
   6187 }
   6188 
   6189 
   6190 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6191    required. */
   6192 
   6193 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   6194 
   6195 static HChar* nameBtOp ( BtOp op )
   6196 {
   6197    switch (op) {
   6198       case BtOpNone:  return "";
   6199       case BtOpSet:   return "s";
   6200       case BtOpReset: return "r";
   6201       case BtOpComp:  return "c";
   6202       default: vpanic("nameBtOp(x86)");
   6203    }
   6204 }
   6205 
   6206 
   6207 static
   6208 UInt dis_bt_G_E ( VexAbiInfo* vbi,
   6209                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6210 {
   6211    HChar  dis_buf[50];
   6212    UChar  modrm;
   6213    Int    len;
   6214    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6215           t_addr1, t_esp, t_mask, t_new;
   6216 
   6217    vassert(sz == 2 || sz == 4);
   6218 
   6219    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6220              = t_addr0 = t_addr1 = t_esp
   6221              = t_mask = t_new = IRTemp_INVALID;
   6222 
   6223    t_fetched = newTemp(Ity_I8);
   6224    t_new     = newTemp(Ity_I8);
   6225    t_bitno0  = newTemp(Ity_I32);
   6226    t_bitno1  = newTemp(Ity_I32);
   6227    t_bitno2  = newTemp(Ity_I8);
   6228    t_addr1   = newTemp(Ity_I32);
   6229    modrm     = getIByte(delta);
   6230 
   6231    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6232 
   6233    if (epartIsReg(modrm)) {
   6234       delta++;
   6235       /* Get it onto the client's stack. */
   6236       t_esp = newTemp(Ity_I32);
   6237       t_addr0 = newTemp(Ity_I32);
   6238 
   6239       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6240          guest_amd64_toIR.c.  We point out here only that 128 is
   6241          fast-cased in Memcheck and is > 0, so seems like a good
   6242          choice. */
   6243       vassert(vbi->guest_stack_redzone_size == 0);
   6244       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6245       putIReg(4, R_ESP, mkexpr(t_esp));
   6246 
   6247       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6248 
   6249       /* Make t_addr0 point at it. */
   6250       assign( t_addr0, mkexpr(t_esp) );
   6251 
   6252       /* Mask out upper bits of the shift amount, since we're doing a
   6253          reg. */
   6254       assign( t_bitno1, binop(Iop_And32,
   6255                               mkexpr(t_bitno0),
   6256                               mkU32(sz == 4 ? 31 : 15)) );
   6257 
   6258    } else {
   6259       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6260       delta += len;
   6261       assign( t_bitno1, mkexpr(t_bitno0) );
   6262    }
   6263 
   6264    /* At this point: t_addr0 is the address being operated on.  If it
   6265       was a reg, we will have pushed it onto the client's stack.
   6266       t_bitno1 is the bit number, suitably masked in the case of a
   6267       reg.  */
   6268 
   6269    /* Now the main sequence. */
   6270    assign( t_addr1,
   6271            binop(Iop_Add32,
   6272                  mkexpr(t_addr0),
   6273                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6274 
   6275    /* t_addr1 now holds effective address */
   6276 
   6277    assign( t_bitno2,
   6278            unop(Iop_32to8,
   6279                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6280 
   6281    /* t_bitno2 contains offset of bit within byte */
   6282 
   6283    if (op != BtOpNone) {
   6284       t_mask = newTemp(Ity_I8);
   6285       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6286    }
   6287 
   6288    /* t_mask is now a suitable byte mask */
   6289 
   6290    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6291 
   6292    if (op != BtOpNone) {
   6293       switch (op) {
   6294          case BtOpSet:
   6295             assign( t_new,
   6296                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6297             break;
   6298          case BtOpComp:
   6299             assign( t_new,
   6300                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6301             break;
   6302          case BtOpReset:
   6303             assign( t_new,
   6304                     binop(Iop_And8, mkexpr(t_fetched),
   6305                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6306             break;
   6307          default:
   6308             vpanic("dis_bt_G_E(x86)");
   6309       }
   6310       if (locked && !epartIsReg(modrm)) {
   6311          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6312                                  mkexpr(t_new)/*new*/,
   6313                                  guest_EIP_curr_instr );
   6314       } else {
   6315          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6316       }
   6317    }
   6318 
   6319    /* Side effect done; now get selected bit into Carry flag */
   6320    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6321    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6322    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6323    stmt( IRStmt_Put(
   6324             OFFB_CC_DEP1,
   6325             binop(Iop_And32,
   6326                   binop(Iop_Shr32,
   6327                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6328                         mkexpr(t_bitno2)),
   6329                   mkU32(1)))
   6330        );
   6331    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6332       elimination of previous stores to this field work better. */
   6333    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6334 
   6335    /* Move reg operand from stack back to reg */
   6336    if (epartIsReg(modrm)) {
   6337       /* t_esp still points at it. */
   6338       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6339       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6340    }
   6341 
   6342    DIP("bt%s%c %s, %s\n",
   6343        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6344        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6345 
   6346    return delta;
   6347 }
   6348 
   6349 
   6350 
   6351 /* Handle BSF/BSR.  Only v-size seems necessary. */
   6352 static
   6353 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6354 {
   6355    Bool   isReg;
   6356    UChar  modrm;
   6357    HChar  dis_buf[50];
   6358 
   6359    IRType ty  = szToITy(sz);
   6360    IRTemp src = newTemp(ty);
   6361    IRTemp dst = newTemp(ty);
   6362 
   6363    IRTemp src32 = newTemp(Ity_I32);
   6364    IRTemp dst32 = newTemp(Ity_I32);
   6365    IRTemp src8  = newTemp(Ity_I8);
   6366 
   6367    vassert(sz == 4 || sz == 2);
   6368 
   6369    modrm = getIByte(delta);
   6370 
   6371    isReg = epartIsReg(modrm);
   6372    if (isReg) {
   6373       delta++;
   6374       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6375    } else {
   6376       Int    len;
   6377       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6378       delta += len;
   6379       assign( src, loadLE(ty, mkexpr(addr)) );
   6380    }
   6381 
   6382    DIP("bs%c%c %s, %s\n",
   6383        fwds ? 'f' : 'r', nameISize(sz),
   6384        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6385        nameIReg(sz, gregOfRM(modrm)));
   6386 
   6387    /* Generate an 8-bit expression which is zero iff the
   6388       original is zero, and nonzero otherwise */
   6389    assign( src8,
   6390            unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
   6391                            mkexpr(src), mkU(ty,0))) );
   6392 
   6393    /* Flags: Z is 1 iff source value is zero.  All others
   6394       are undefined -- we force them to zero. */
   6395    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6396    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6397    stmt( IRStmt_Put(
   6398             OFFB_CC_DEP1,
   6399             IRExpr_Mux0X( mkexpr(src8),
   6400                           /* src==0 */
   6401                           mkU32(X86G_CC_MASK_Z),
   6402                           /* src!=0 */
   6403                           mkU32(0)
   6404                         )
   6405        ));
   6406    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6407       elimination of previous stores to this field work better. */
   6408    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6409 
   6410    /* Result: iff source value is zero, we can't use
   6411       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6412       But anyway, Intel x86 semantics say the result is undefined in
   6413       such situations.  Hence handle the zero case specially. */
   6414 
   6415    /* Bleh.  What we compute:
   6416 
   6417           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6418           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6419 
   6420           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6421           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6422 
   6423       First, widen src to 32 bits if it is not already.
   6424 
   6425       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6426       dst register unchanged when src == 0.  Hence change accordingly.
   6427    */
   6428    if (sz == 2)
   6429       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6430    else
   6431       assign( src32, mkexpr(src) );
   6432 
   6433    /* The main computation, guarding against zero. */
   6434    assign( dst32,
   6435            IRExpr_Mux0X(
   6436               mkexpr(src8),
   6437               /* src == 0 -- leave dst unchanged */
   6438               widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
   6439               /* src != 0 */
   6440               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6441                    : binop(Iop_Sub32,
   6442                            mkU32(31),
   6443                            unop(Iop_Clz32, mkexpr(src32)))
   6444            )
   6445          );
   6446 
   6447    if (sz == 2)
   6448       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6449    else
   6450       assign( dst, mkexpr(dst32) );
   6451 
   6452    /* dump result back */
   6453    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6454 
   6455    return delta;
   6456 }
   6457 
   6458 
   6459 static
   6460 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6461 {
   6462    IRType ty = szToITy(sz);
   6463    IRTemp t1 = newTemp(ty);
   6464    IRTemp t2 = newTemp(ty);
   6465    vassert(sz == 2 || sz == 4);
   6466    assign( t1, getIReg(sz, R_EAX) );
   6467    assign( t2, getIReg(sz, reg) );
   6468    putIReg( sz, R_EAX, mkexpr(t2) );
   6469    putIReg( sz, reg, mkexpr(t1) );
   6470    DIP("xchg%c %s, %s\n",
   6471        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6472 }
   6473 
   6474 
   6475 static
   6476 void codegen_SAHF ( void )
   6477 {
   6478    /* Set the flags to:
   6479       (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
   6480       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6481                 |X86G_CC_MASK_P|X86G_CC_MASK_C)
   6482    */
   6483    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6484                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6485    IRTemp oldflags   = newTemp(Ity_I32);
   6486    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6487    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6488    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6489    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6490    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6491          binop(Iop_Or32,
   6492                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6493                binop(Iop_And32,
   6494                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6495                      mkU32(mask_SZACP))
   6496               )
   6497    ));
   6498    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6499       elimination of previous stores to this field work better. */
   6500    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6501 }
   6502 
   6503 
   6504 static
   6505 void codegen_LAHF ( void  )
   6506 {
   6507    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   6508    IRExpr* eax_with_hole;
   6509    IRExpr* new_byte;
   6510    IRExpr* new_eax;
   6511    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6512                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6513 
   6514    IRTemp  flags = newTemp(Ity_I32);
   6515    assign( flags, mk_x86g_calculate_eflags_all() );
   6516 
   6517    eax_with_hole
   6518       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6519    new_byte
   6520       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6521                         mkU32(1<<1));
   6522    new_eax
   6523       = binop(Iop_Or32, eax_with_hole,
   6524                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6525    putIReg(4, R_EAX, new_eax);
   6526 }
   6527 
   6528 
   6529 static
   6530 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6531                        Bool        locked,
   6532                        Int         size,
   6533                        Int         delta0 )
   6534 {
   6535    HChar dis_buf[50];
   6536    Int   len;
   6537 
   6538    IRType ty    = szToITy(size);
   6539    IRTemp acc   = newTemp(ty);
   6540    IRTemp src   = newTemp(ty);
   6541    IRTemp dest  = newTemp(ty);
   6542    IRTemp dest2 = newTemp(ty);
   6543    IRTemp acc2  = newTemp(ty);
   6544    IRTemp cond8 = newTemp(Ity_I8);
   6545    IRTemp addr  = IRTemp_INVALID;
   6546    UChar  rm    = getUChar(delta0);
   6547 
   6548    /* There are 3 cases to consider:
   6549 
   6550       reg-reg: ignore any lock prefix, generate sequence based
   6551                on Mux0X
   6552 
   6553       reg-mem, not locked: ignore any lock prefix, generate sequence
   6554                            based on Mux0X
   6555 
   6556       reg-mem, locked: use IRCAS
   6557    */
   6558    if (epartIsReg(rm)) {
   6559       /* case 1 */
   6560       assign( dest, getIReg(size, eregOfRM(rm)) );
   6561       delta0++;
   6562       assign( src, getIReg(size, gregOfRM(rm)) );
   6563       assign( acc, getIReg(size, R_EAX) );
   6564       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6565       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6566       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6567       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6568       putIReg(size, R_EAX, mkexpr(acc2));
   6569       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6570       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6571                                nameIReg(size,gregOfRM(rm)),
   6572                                nameIReg(size,eregOfRM(rm)) );
   6573    }
   6574    else if (!epartIsReg(rm) && !locked) {
   6575       /* case 2 */
   6576       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6577       assign( dest, loadLE(ty, mkexpr(addr)) );
   6578       delta0 += len;
   6579       assign( src, getIReg(size, gregOfRM(rm)) );
   6580       assign( acc, getIReg(size, R_EAX) );
   6581       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6582       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6583       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6584       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6585       putIReg(size, R_EAX, mkexpr(acc2));
   6586       storeLE( mkexpr(addr), mkexpr(dest2) );
   6587       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6588                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6589    }
   6590    else if (!epartIsReg(rm) && locked) {
   6591       /* case 3 */
   6592       /* src is new value.  acc is expected value.  dest is old value.
   6593          Compute success from the output of the IRCAS, and steer the
   6594          new value for EAX accordingly: in case of success, EAX is
   6595          unchanged. */
   6596       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6597       delta0 += len;
   6598       assign( src, getIReg(size, gregOfRM(rm)) );
   6599       assign( acc, getIReg(size, R_EAX) );
   6600       stmt( IRStmt_CAS(
   6601          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6602                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6603       ));
   6604       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6605       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6606       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6607       putIReg(size, R_EAX, mkexpr(acc2));
   6608       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6609                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6610    }
   6611    else vassert(0);
   6612 
   6613    return delta0;
   6614 }
   6615 
   6616 
   6617 /* Handle conditional move instructions of the form
   6618       cmovcc E(reg-or-mem), G(reg)
   6619 
   6620    E(src) is reg-or-mem
   6621    G(dst) is reg.
   6622 
   6623    If E is reg, -->    GET %E, tmps
   6624                        GET %G, tmpd
   6625                        CMOVcc tmps, tmpd
   6626                        PUT tmpd, %G
   6627 
   6628    If E is mem  -->    (getAddr E) -> tmpa
   6629                        LD (tmpa), tmps
   6630                        GET %G, tmpd
   6631                        CMOVcc tmps, tmpd
   6632                        PUT tmpd, %G
   6633 */
   6634 static
   6635 UInt dis_cmov_E_G ( UChar       sorb,
   6636                     Int         sz,
   6637                     X86Condcode cond,
   6638                     Int         delta0 )
   6639 {
   6640    UChar rm  = getIByte(delta0);
   6641    HChar dis_buf[50];
   6642    Int   len;
   6643 
   6644    IRType ty   = szToITy(sz);
   6645    IRTemp tmps = newTemp(ty);
   6646    IRTemp tmpd = newTemp(ty);
   6647 
   6648    if (epartIsReg(rm)) {
   6649       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6650       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6651 
   6652       putIReg(sz, gregOfRM(rm),
   6653                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6654                                      mk_x86g_calculate_condition(cond)),
   6655                                 mkexpr(tmpd),
   6656                                 mkexpr(tmps) )
   6657              );
   6658       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6659                               name_X86Condcode(cond),
   6660                               nameIReg(sz,eregOfRM(rm)),
   6661                               nameIReg(sz,gregOfRM(rm)));
   6662       return 1+delta0;
   6663    }
   6664 
   6665    /* E refers to memory */
   6666    {
   6667       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6668       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6669       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6670 
   6671       putIReg(sz, gregOfRM(rm),
   6672                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6673                                      mk_x86g_calculate_condition(cond)),
   6674                                 mkexpr(tmpd),
   6675                                 mkexpr(tmps) )
   6676              );
   6677 
   6678       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6679                               name_X86Condcode(cond),
   6680                               dis_buf,
   6681                               nameIReg(sz,gregOfRM(rm)));
   6682       return len+delta0;
   6683    }
   6684 }
   6685 
   6686 
   6687 static
   6688 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6689                     Bool* decodeOK )
   6690 {
   6691    Int   len;
   6692    UChar rm = getIByte(delta0);
   6693    HChar dis_buf[50];
   6694 
   6695    IRType ty    = szToITy(sz);
   6696    IRTemp tmpd  = newTemp(ty);
   6697    IRTemp tmpt0 = newTemp(ty);
   6698    IRTemp tmpt1 = newTemp(ty);
   6699 
   6700    /* There are 3 cases to consider:
   6701 
   6702       reg-reg: ignore any lock prefix,
   6703                generate 'naive' (non-atomic) sequence
   6704 
   6705       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6706                            (non-atomic) sequence
   6707 
   6708       reg-mem, locked: use IRCAS
   6709    */
   6710 
   6711    if (epartIsReg(rm)) {
   6712       /* case 1 */
   6713       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6714       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6715       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6716                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6717       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6718       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6719       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6720       DIP("xadd%c %s, %s\n",
   6721           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6722           				 nameIReg(sz,eregOfRM(rm)));
   6723       *decodeOK = True;
   6724       return 1+delta0;
   6725    }
   6726    else if (!epartIsReg(rm) && !locked) {
   6727       /* case 2 */
   6728       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6729       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6730       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6731       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6732                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6733       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6734       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6735       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6736       DIP("xadd%c %s, %s\n",
   6737           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6738       *decodeOK = True;
   6739       return len+delta0;
   6740    }
   6741    else if (!epartIsReg(rm) && locked) {
   6742       /* case 3 */
   6743       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6744       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6745       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6746       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6747                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6748       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6749                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6750       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6751       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6752       DIP("xadd%c %s, %s\n",
   6753           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6754       *decodeOK = True;
   6755       return len+delta0;
   6756    }
   6757    /*UNREACHED*/
   6758    vassert(0);
   6759 }
   6760 
   6761 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   6762 
   6763 static
   6764 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6765 {
   6766    Int    len;
   6767    IRTemp addr;
   6768    UChar  rm  = getIByte(delta0);
   6769    HChar  dis_buf[50];
   6770 
   6771    if (epartIsReg(rm)) {
   6772       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6773       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6774       return 1+delta0;
   6775    } else {
   6776       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6777       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6778       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6779       return len+delta0;
   6780    }
   6781 }
   6782 
   6783 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6784    dst is ireg and sz==4, zero out top half of it.  */
   6785 
   6786 static
   6787 UInt dis_mov_Sw_Ew ( UChar sorb,
   6788                      Int   sz,
   6789                      Int   delta0 )
   6790 {
   6791    Int    len;
   6792    IRTemp addr;
   6793    UChar  rm  = getIByte(delta0);
   6794    HChar  dis_buf[50];
   6795 
   6796    vassert(sz == 2 || sz == 4);
   6797 
   6798    if (epartIsReg(rm)) {
   6799       if (sz == 4)
   6800          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6801       else
   6802          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6803 
   6804       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6805       return 1+delta0;
   6806    } else {
   6807       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6808       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6809       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6810       return len+delta0;
   6811    }
   6812 }
   6813 
   6814 
   6815 static
   6816 void dis_push_segreg ( UInt sreg, Int sz )
   6817 {
   6818     IRTemp t1 = newTemp(Ity_I16);
   6819     IRTemp ta = newTemp(Ity_I32);
   6820     vassert(sz == 2 || sz == 4);
   6821 
   6822     assign( t1, getSReg(sreg) );
   6823     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6824     putIReg(4, R_ESP, mkexpr(ta));
   6825     storeLE( mkexpr(ta), mkexpr(t1) );
   6826 
   6827     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6828 }
   6829 
   6830 static
   6831 void dis_pop_segreg ( UInt sreg, Int sz )
   6832 {
   6833     IRTemp t1 = newTemp(Ity_I16);
   6834     IRTemp ta = newTemp(Ity_I32);
   6835     vassert(sz == 2 || sz == 4);
   6836 
   6837     assign( ta, getIReg(4, R_ESP) );
   6838     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6839 
   6840     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6841     putSReg( sreg, mkexpr(t1) );
   6842     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6843 }
   6844 
   6845 static
   6846 void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
   6847 {
   6848    IRTemp t1 = newTemp(Ity_I32);
   6849    IRTemp t2 = newTemp(Ity_I32);
   6850    assign(t1, getIReg(4,R_ESP));
   6851    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6852    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6853    jmp_treg(dres, Ijk_Ret, t2);
   6854    vassert(dres->whatNext == Dis_StopHere);
   6855 }
   6856 
   6857 /*------------------------------------------------------------*/
   6858 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6859 /*------------------------------------------------------------*/
   6860 
   6861 /* Worker function; do not call directly.
   6862    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6863 */
   6864 
   6865 static UInt dis_SSE_E_to_G_all_wrk (
   6866                UChar sorb, Int delta,
   6867                HChar* opname, IROp op,
   6868                Bool   invertG
   6869             )
   6870 {
   6871    HChar   dis_buf[50];
   6872    Int     alen;
   6873    IRTemp  addr;
   6874    UChar   rm = getIByte(delta);
   6875    IRExpr* gpart
   6876       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   6877                 : getXMMReg(gregOfRM(rm));
   6878    if (epartIsReg(rm)) {
   6879       putXMMReg( gregOfRM(rm),
   6880                  binop(op, gpart,
   6881                            getXMMReg(eregOfRM(rm))) );
   6882       DIP("%s %s,%s\n", opname,
   6883                         nameXMMReg(eregOfRM(rm)),
   6884                         nameXMMReg(gregOfRM(rm)) );
   6885       return delta+1;
   6886    } else {
   6887       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6888       putXMMReg( gregOfRM(rm),
   6889                  binop(op, gpart,
   6890                            loadLE(Ity_V128, mkexpr(addr))) );
   6891       DIP("%s %s,%s\n", opname,
   6892                         dis_buf,
   6893                         nameXMMReg(gregOfRM(rm)) );
   6894       return delta+alen;
   6895    }
   6896 }
   6897 
   6898 
   6899 /* All lanes SSE binary operation, G = G `op` E. */
   6900 
   6901 static
   6902 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
   6903 {
   6904    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   6905 }
   6906 
   6907 /* All lanes SSE binary operation, G = (not G) `op` E. */
   6908 
   6909 static
   6910 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   6911                                HChar* opname, IROp op )
   6912 {
   6913    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   6914 }
   6915 
   6916 
   6917 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   6918 
   6919 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   6920                                   HChar* opname, IROp op )
   6921 {
   6922    HChar   dis_buf[50];
   6923    Int     alen;
   6924    IRTemp  addr;
   6925    UChar   rm = getIByte(delta);
   6926    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6927    if (epartIsReg(rm)) {
   6928       putXMMReg( gregOfRM(rm),
   6929                  binop(op, gpart,
   6930                            getXMMReg(eregOfRM(rm))) );
   6931       DIP("%s %s,%s\n", opname,
   6932                         nameXMMReg(eregOfRM(rm)),
   6933                         nameXMMReg(gregOfRM(rm)) );
   6934       return delta+1;
   6935    } else {
   6936       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   6937          E operand needs to be made simply of zeroes. */
   6938       IRTemp epart = newTemp(Ity_V128);
   6939       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6940       assign( epart, unop( Iop_32UtoV128,
   6941                            loadLE(Ity_I32, mkexpr(addr))) );
   6942       putXMMReg( gregOfRM(rm),
   6943                  binop(op, gpart, mkexpr(epart)) );
   6944       DIP("%s %s,%s\n", opname,
   6945                         dis_buf,
   6946                         nameXMMReg(gregOfRM(rm)) );
   6947       return delta+alen;
   6948    }
   6949 }
   6950 
   6951 
   6952 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   6953 
   6954 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   6955                                   HChar* opname, IROp op )
   6956 {
   6957    HChar   dis_buf[50];
   6958    Int     alen;
   6959    IRTemp  addr;
   6960    UChar   rm = getIByte(delta);
   6961    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6962    if (epartIsReg(rm)) {
   6963       putXMMReg( gregOfRM(rm),
   6964                  binop(op, gpart,
   6965                            getXMMReg(eregOfRM(rm))) );
   6966       DIP("%s %s,%s\n", opname,
   6967                         nameXMMReg(eregOfRM(rm)),
   6968                         nameXMMReg(gregOfRM(rm)) );
   6969       return delta+1;
   6970    } else {
   6971       /* We can only do a 64-bit memory read, so the upper half of the
   6972          E operand needs to be made simply of zeroes. */
   6973       IRTemp epart = newTemp(Ity_V128);
   6974       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6975       assign( epart, unop( Iop_64UtoV128,
   6976                            loadLE(Ity_I64, mkexpr(addr))) );
   6977       putXMMReg( gregOfRM(rm),
   6978                  binop(op, gpart, mkexpr(epart)) );
   6979       DIP("%s %s,%s\n", opname,
   6980                         dis_buf,
   6981                         nameXMMReg(gregOfRM(rm)) );
   6982       return delta+alen;
   6983    }
   6984 }
   6985 
   6986 
   6987 /* All lanes unary SSE operation, G = op(E). */
   6988 
   6989 static UInt dis_SSE_E_to_G_unary_all (
   6990                UChar sorb, Int delta,
   6991                HChar* opname, IROp op
   6992             )
   6993 {
   6994    HChar   dis_buf[50];
   6995    Int     alen;
   6996    IRTemp  addr;
   6997    UChar   rm = getIByte(delta);
   6998    if (epartIsReg(rm)) {
   6999       putXMMReg( gregOfRM(rm),
   7000                  unop(op, getXMMReg(eregOfRM(rm))) );
   7001       DIP("%s %s,%s\n", opname,
   7002                         nameXMMReg(eregOfRM(rm)),
   7003                         nameXMMReg(gregOfRM(rm)) );
   7004       return delta+1;
   7005    } else {
   7006       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7007       putXMMReg( gregOfRM(rm),
   7008                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   7009       DIP("%s %s,%s\n", opname,
   7010                         dis_buf,
   7011                         nameXMMReg(gregOfRM(rm)) );
   7012       return delta+alen;
   7013    }
   7014 }
   7015 
   7016 
   7017 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   7018 
   7019 static UInt dis_SSE_E_to_G_unary_lo32 (
   7020                UChar sorb, Int delta,
   7021                HChar* opname, IROp op
   7022             )
   7023 {
   7024    /* First we need to get the old G value and patch the low 32 bits
   7025       of the E operand into it.  Then apply op and write back to G. */
   7026    HChar   dis_buf[50];
   7027    Int     alen;
   7028    IRTemp  addr;
   7029    UChar   rm = getIByte(delta);
   7030    IRTemp  oldG0 = newTemp(Ity_V128);
   7031    IRTemp  oldG1 = newTemp(Ity_V128);
   7032 
   7033    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7034 
   7035    if (epartIsReg(rm)) {
   7036       assign( oldG1,
   7037               binop( Iop_SetV128lo32,
   7038                      mkexpr(oldG0),
   7039                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7040       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7041       DIP("%s %s,%s\n", opname,
   7042                         nameXMMReg(eregOfRM(rm)),
   7043                         nameXMMReg(gregOfRM(rm)) );
   7044       return delta+1;
   7045    } else {
   7046       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7047       assign( oldG1,
   7048               binop( Iop_SetV128lo32,
   7049                      mkexpr(oldG0),
   7050                      loadLE(Ity_I32, mkexpr(addr)) ));
   7051       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7052       DIP("%s %s,%s\n", opname,
   7053                         dis_buf,
   7054                         nameXMMReg(gregOfRM(rm)) );
   7055       return delta+alen;
   7056    }
   7057 }
   7058 
   7059 
   7060 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7061 
   7062 static UInt dis_SSE_E_to_G_unary_lo64 (
   7063                UChar sorb, Int delta,
   7064                HChar* opname, IROp op
   7065             )
   7066 {
   7067    /* First we need to get the old G value and patch the low 64 bits
   7068       of the E operand into it.  Then apply op and write back to G. */
   7069    HChar   dis_buf[50];
   7070    Int     alen;
   7071    IRTemp  addr;
   7072    UChar   rm = getIByte(delta);
   7073    IRTemp  oldG0 = newTemp(Ity_V128);
   7074    IRTemp  oldG1 = newTemp(Ity_V128);
   7075 
   7076    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7077 
   7078    if (epartIsReg(rm)) {
   7079       assign( oldG1,
   7080               binop( Iop_SetV128lo64,
   7081                      mkexpr(oldG0),
   7082                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7083       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7084       DIP("%s %s,%s\n", opname,
   7085                         nameXMMReg(eregOfRM(rm)),
   7086                         nameXMMReg(gregOfRM(rm)) );
   7087       return delta+1;
   7088    } else {
   7089       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7090       assign( oldG1,
   7091               binop( Iop_SetV128lo64,
   7092                      mkexpr(oldG0),
   7093                      loadLE(Ity_I64, mkexpr(addr)) ));
   7094       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7095       DIP("%s %s,%s\n", opname,
   7096                         dis_buf,
   7097                         nameXMMReg(gregOfRM(rm)) );
   7098       return delta+alen;
   7099    }
   7100 }
   7101 
   7102 
   7103 /* SSE integer binary operation:
   7104       G = G `op` E   (eLeft == False)
   7105       G = E `op` G   (eLeft == True)
   7106 */
   7107 static UInt dis_SSEint_E_to_G(
   7108                UChar sorb, Int delta,
   7109                HChar* opname, IROp op,
   7110                Bool   eLeft
   7111             )
   7112 {
   7113    HChar   dis_buf[50];
   7114    Int     alen;
   7115    IRTemp  addr;
   7116    UChar   rm = getIByte(delta);
   7117    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7118    IRExpr* epart = NULL;
   7119    if (epartIsReg(rm)) {
   7120       epart = getXMMReg(eregOfRM(rm));
   7121       DIP("%s %s,%s\n", opname,
   7122                         nameXMMReg(eregOfRM(rm)),
   7123                         nameXMMReg(gregOfRM(rm)) );
   7124       delta += 1;
   7125    } else {
   7126       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7127       epart = loadLE(Ity_V128, mkexpr(addr));
   7128       DIP("%s %s,%s\n", opname,
   7129                         dis_buf,
   7130                         nameXMMReg(gregOfRM(rm)) );
   7131       delta += alen;
   7132    }
   7133    putXMMReg( gregOfRM(rm),
   7134               eLeft ? binop(op, epart, gpart)
   7135 	            : binop(op, gpart, epart) );
   7136    return delta;
   7137 }
   7138 
   7139 
   7140 /* Helper for doing SSE FP comparisons. */
   7141 
   7142 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7143                            Int imm8, Bool all_lanes, Int sz )
   7144 {
   7145    imm8 &= 7;
   7146    *needNot = False;
   7147    *op      = Iop_INVALID;
   7148    if (imm8 >= 4) {
   7149       *needNot = True;
   7150       imm8 -= 4;
   7151    }
   7152 
   7153    if (sz == 4 && all_lanes) {
   7154       switch (imm8) {
   7155          case 0: *op = Iop_CmpEQ32Fx4; return;
   7156          case 1: *op = Iop_CmpLT32Fx4; return;
   7157          case 2: *op = Iop_CmpLE32Fx4; return;
   7158          case 3: *op = Iop_CmpUN32Fx4; return;
   7159          default: break;
   7160       }
   7161    }
   7162    if (sz == 4 && !all_lanes) {
   7163       switch (imm8) {
   7164          case 0: *op = Iop_CmpEQ32F0x4; return;
   7165          case 1: *op = Iop_CmpLT32F0x4; return;
   7166          case 2: *op = Iop_CmpLE32F0x4; return;
   7167          case 3: *op = Iop_CmpUN32F0x4; return;
   7168          default: break;
   7169       }
   7170    }
   7171    if (sz == 8 && all_lanes) {
   7172       switch (imm8) {
   7173          case 0: *op = Iop_CmpEQ64Fx2; return;
   7174          case 1: *op = Iop_CmpLT64Fx2; return;
   7175          case 2: *op = Iop_CmpLE64Fx2; return;
   7176          case 3: *op = Iop_CmpUN64Fx2; return;
   7177          default: break;
   7178       }
   7179    }
   7180    if (sz == 8 && !all_lanes) {
   7181       switch (imm8) {
   7182          case 0: *op = Iop_CmpEQ64F0x2; return;
   7183          case 1: *op = Iop_CmpLT64F0x2; return;
   7184          case 2: *op = Iop_CmpLE64F0x2; return;
   7185          case 3: *op = Iop_CmpUN64F0x2; return;
   7186          default: break;
   7187       }
   7188    }
   7189    vpanic("findSSECmpOp(x86,guest)");
   7190 }
   7191 
   7192 /* Handles SSE 32F/64F comparisons. */
   7193 
   7194 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7195 				HChar* opname, Bool all_lanes, Int sz )
   7196 {
   7197    HChar   dis_buf[50];
   7198    Int     alen, imm8;
   7199    IRTemp  addr;
   7200    Bool    needNot = False;
   7201    IROp    op      = Iop_INVALID;
   7202    IRTemp  plain   = newTemp(Ity_V128);
   7203    UChar   rm      = getIByte(delta);
   7204    UShort  mask    = 0;
   7205    vassert(sz == 4 || sz == 8);
   7206    if (epartIsReg(rm)) {
   7207       imm8 = getIByte(delta+1);
   7208       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7209       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7210                                getXMMReg(eregOfRM(rm))) );
   7211       delta += 2;
   7212       DIP("%s $%d,%s,%s\n", opname,
   7213                             (Int)imm8,
   7214                             nameXMMReg(eregOfRM(rm)),
   7215                             nameXMMReg(gregOfRM(rm)) );
   7216    } else {
   7217       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7218       imm8 = getIByte(delta+alen);
   7219       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7220       assign( plain,
   7221               binop(
   7222                  op,
   7223                  getXMMReg(gregOfRM(rm)),
   7224                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7225                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7226                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7227              )
   7228       );
   7229       delta += alen+1;
   7230       DIP("%s $%d,%s,%s\n", opname,
   7231                             (Int)imm8,
   7232                             dis_buf,
   7233                             nameXMMReg(gregOfRM(rm)) );
   7234    }
   7235 
   7236    if (needNot && all_lanes) {
   7237       putXMMReg( gregOfRM(rm),
   7238                  unop(Iop_NotV128, mkexpr(plain)) );
   7239    }
   7240    else
   7241    if (needNot && !all_lanes) {
   7242       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7243       putXMMReg( gregOfRM(rm),
   7244                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7245    }
   7246    else {
   7247       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7248    }
   7249 
   7250    return delta;
   7251 }
   7252 
   7253 
   7254 /* Vector by scalar shift of G by the amount specified at the bottom
   7255    of E. */
   7256 
   7257 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7258                                  HChar* opname, IROp op )
   7259 {
   7260    HChar   dis_buf[50];
   7261    Int     alen, size;
   7262    IRTemp  addr;
   7263    Bool    shl, shr, sar;
   7264    UChar   rm   = getIByte(delta);
   7265    IRTemp  g0   = newTemp(Ity_V128);
   7266    IRTemp  g1   = newTemp(Ity_V128);
   7267    IRTemp  amt  = newTemp(Ity_I32);
   7268    IRTemp  amt8 = newTemp(Ity_I8);
   7269    if (epartIsReg(rm)) {
   7270       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7271       DIP("%s %s,%s\n", opname,
   7272                         nameXMMReg(eregOfRM(rm)),
   7273                         nameXMMReg(gregOfRM(rm)) );
   7274       delta++;
   7275    } else {
   7276       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7277       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7278       DIP("%s %s,%s\n", opname,
   7279                         dis_buf,
   7280                         nameXMMReg(gregOfRM(rm)) );
   7281       delta += alen;
   7282    }
   7283    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7284    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7285 
   7286    shl = shr = sar = False;
   7287    size = 0;
   7288    switch (op) {
   7289       case Iop_ShlN16x8: shl = True; size = 32; break;
   7290       case Iop_ShlN32x4: shl = True; size = 32; break;
   7291       case Iop_ShlN64x2: shl = True; size = 64; break;
   7292       case Iop_SarN16x8: sar = True; size = 16; break;
   7293       case Iop_SarN32x4: sar = True; size = 32; break;
   7294       case Iop_ShrN16x8: shr = True; size = 16; break;
   7295       case Iop_ShrN32x4: shr = True; size = 32; break;
   7296       case Iop_ShrN64x2: shr = True; size = 64; break;
   7297       default: vassert(0);
   7298    }
   7299 
   7300    if (shl || shr) {
   7301      assign(
   7302         g1,
   7303         IRExpr_Mux0X(
   7304            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7305            mkV128(0x0000),
   7306            binop(op, mkexpr(g0), mkexpr(amt8))
   7307         )
   7308      );
   7309    } else
   7310    if (sar) {
   7311      assign(
   7312         g1,
   7313         IRExpr_Mux0X(
   7314            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7315            binop(op, mkexpr(g0), mkU8(size-1)),
   7316            binop(op, mkexpr(g0), mkexpr(amt8))
   7317         )
   7318      );
   7319    } else {
   7320       /*NOTREACHED*/
   7321       vassert(0);
   7322    }
   7323 
   7324    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7325    return delta;
   7326 }
   7327 
   7328 
   7329 /* Vector by scalar shift of E by an immediate byte. */
   7330 
   7331 static
   7332 UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
   7333 {
   7334    Bool    shl, shr, sar;
   7335    UChar   rm   = getIByte(delta);
   7336    IRTemp  e0   = newTemp(Ity_V128);
   7337    IRTemp  e1   = newTemp(Ity_V128);
   7338    UChar   amt, size;
   7339    vassert(epartIsReg(rm));
   7340    vassert(gregOfRM(rm) == 2
   7341            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7342    amt = getIByte(delta+1);
   7343    delta += 2;
   7344    DIP("%s $%d,%s\n", opname,
   7345                       (Int)amt,
   7346                       nameXMMReg(eregOfRM(rm)) );
   7347    assign( e0, getXMMReg(eregOfRM(rm)) );
   7348 
   7349    shl = shr = sar = False;
   7350    size = 0;
   7351    switch (op) {
   7352       case Iop_ShlN16x8: shl = True; size = 16; break;
   7353       case Iop_ShlN32x4: shl = True; size = 32; break;
   7354       case Iop_ShlN64x2: shl = True; size = 64; break;
   7355       case Iop_SarN16x8: sar = True; size = 16; break;
   7356       case Iop_SarN32x4: sar = True; size = 32; break;
   7357       case Iop_ShrN16x8: shr = True; size = 16; break;
   7358       case Iop_ShrN32x4: shr = True; size = 32; break;
   7359       case Iop_ShrN64x2: shr = True; size = 64; break;
   7360       default: vassert(0);
   7361    }
   7362 
   7363    if (shl || shr) {
   7364       assign( e1, amt >= size
   7365                      ? mkV128(0x0000)
   7366                      : binop(op, mkexpr(e0), mkU8(amt))
   7367       );
   7368    } else
   7369    if (sar) {
   7370       assign( e1, amt >= size
   7371                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7372                      : binop(op, mkexpr(e0), mkU8(amt))
   7373       );
   7374    } else {
   7375       /*NOTREACHED*/
   7376       vassert(0);
   7377    }
   7378 
   7379    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7380    return delta;
   7381 }
   7382 
   7383 
   7384 /* Get the current SSE rounding mode. */
   7385 
   7386 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7387 {
   7388    return binop( Iop_And32,
   7389                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7390                  mkU32(3) );
   7391 }
   7392 
   7393 static void put_sse_roundingmode ( IRExpr* sseround )
   7394 {
   7395    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7396    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7397 }
   7398 
   7399 /* Break a 128-bit value up into four 32-bit ints. */
   7400 
   7401 static void breakup128to32s ( IRTemp t128,
   7402 			      /*OUTs*/
   7403                               IRTemp* t3, IRTemp* t2,
   7404                               IRTemp* t1, IRTemp* t0 )
   7405 {
   7406    IRTemp hi64 = newTemp(Ity_I64);
   7407    IRTemp lo64 = newTemp(Ity_I64);
   7408    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7409    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7410 
   7411    vassert(t0 && *t0 == IRTemp_INVALID);
   7412    vassert(t1 && *t1 == IRTemp_INVALID);
   7413    vassert(t2 && *t2 == IRTemp_INVALID);
   7414    vassert(t3 && *t3 == IRTemp_INVALID);
   7415 
   7416    *t0 = newTemp(Ity_I32);
   7417    *t1 = newTemp(Ity_I32);
   7418    *t2 = newTemp(Ity_I32);
   7419    *t3 = newTemp(Ity_I32);
   7420    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7421    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7422    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7423    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7424 }
   7425 
   7426 /* Construct a 128-bit value from four 32-bit ints. */
   7427 
   7428 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7429                               IRTemp t1, IRTemp t0 )
   7430 {
   7431    return
   7432       binop( Iop_64HLtoV128,
   7433              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7434              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7435    );
   7436 }
   7437 
   7438 /* Break a 64-bit value up into four 16-bit ints. */
   7439 
   7440 static void breakup64to16s ( IRTemp t64,
   7441                              /*OUTs*/
   7442                              IRTemp* t3, IRTemp* t2,
   7443                              IRTemp* t1, IRTemp* t0 )
   7444 {
   7445    IRTemp hi32 = newTemp(Ity_I32);
   7446    IRTemp lo32 = newTemp(Ity_I32);
   7447    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7448    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7449 
   7450    vassert(t0 && *t0 == IRTemp_INVALID);
   7451    vassert(t1 && *t1 == IRTemp_INVALID);
   7452    vassert(t2 && *t2 == IRTemp_INVALID);
   7453    vassert(t3 && *t3 == IRTemp_INVALID);
   7454 
   7455    *t0 = newTemp(Ity_I16);
   7456    *t1 = newTemp(Ity_I16);
   7457    *t2 = newTemp(Ity_I16);
   7458    *t3 = newTemp(Ity_I16);
   7459    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7460    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7461    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7462    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7463 }
   7464 
   7465 /* Construct a 64-bit value from four 16-bit ints. */
   7466 
   7467 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7468                              IRTemp t1, IRTemp t0 )
   7469 {
   7470    return
   7471       binop( Iop_32HLto64,
   7472              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7473              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7474    );
   7475 }
   7476 
   7477 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7478    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7479    C P D ID AC.
   7480 
   7481    In all cases, code to set AC is generated.  However, VEX actually
   7482    ignores the AC value and so can optionally emit an emulation
   7483    warning when it is enabled.  In this routine, an emulation warning
   7484    is only emitted if emit_AC_emwarn is True, in which case
   7485    next_insn_EIP must be correct (this allows for correct code
   7486    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7487    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7488    generation for IRET.) */
   7489 
   7490 static
   7491 void set_EFLAGS_from_value ( IRTemp t1,
   7492                              Bool   emit_AC_emwarn,
   7493                              Addr32 next_insn_EIP )
   7494 {
   7495    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7496 
   7497    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7498       the flags thunk to X86G_CC_OP_COPY. */
   7499    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7500    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7501    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7502                      binop(Iop_And32,
   7503                            mkexpr(t1),
   7504                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7505                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7506                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7507                           )
   7508                     )
   7509        );
   7510    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7511       elimination of previous stores to this field work better. */
   7512    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7513 
   7514    /* Also need to set the D flag, which is held in bit 10 of t1.
   7515       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7516    stmt( IRStmt_Put(
   7517             OFFB_DFLAG,
   7518             IRExpr_Mux0X(
   7519                unop(Iop_32to8,
   7520                     binop(Iop_And32,
   7521                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7522                           mkU32(1))),
   7523                mkU32(1),
   7524                mkU32(0xFFFFFFFF)))
   7525        );
   7526 
   7527    /* Set the ID flag */
   7528    stmt( IRStmt_Put(
   7529             OFFB_IDFLAG,
   7530             IRExpr_Mux0X(
   7531                unop(Iop_32to8,
   7532                     binop(Iop_And32,
   7533                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7534                           mkU32(1))),
   7535                mkU32(0),
   7536                mkU32(1)))
   7537        );
   7538 
   7539    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7540       emulation warning. */
   7541    stmt( IRStmt_Put(
   7542             OFFB_ACFLAG,
   7543             IRExpr_Mux0X(
   7544                unop(Iop_32to8,
   7545                     binop(Iop_And32,
   7546                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7547                           mkU32(1))),
   7548                mkU32(0),
   7549                mkU32(1)))
   7550        );
   7551 
   7552    if (emit_AC_emwarn) {
   7553       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7554       stmt(
   7555          IRStmt_Exit(
   7556             binop( Iop_CmpNE32,
   7557                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7558                    mkU32(0) ),
   7559             Ijk_EmWarn,
   7560             IRConst_U32( next_insn_EIP ),
   7561             OFFB_EIP
   7562          )
   7563       );
   7564    }
   7565 }
   7566 
   7567 
   7568 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7569    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7570 
   7571    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7572 */
   7573 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7574 {
   7575    IRTemp aa      = newTemp(Ity_I64);
   7576    IRTemp bb      = newTemp(Ity_I64);
   7577    IRTemp aahi32s = newTemp(Ity_I64);
   7578    IRTemp aalo32s = newTemp(Ity_I64);
   7579    IRTemp bbhi32s = newTemp(Ity_I64);
   7580    IRTemp bblo32s = newTemp(Ity_I64);
   7581    IRTemp rHi     = newTemp(Ity_I64);
   7582    IRTemp rLo     = newTemp(Ity_I64);
   7583    IRTemp one32x2 = newTemp(Ity_I64);
   7584    assign(aa, aax);
   7585    assign(bb, bbx);
   7586    assign( aahi32s,
   7587            binop(Iop_SarN32x2,
   7588                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7589                  mkU8(16) ));
   7590    assign( aalo32s,
   7591            binop(Iop_SarN32x2,
   7592                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7593                  mkU8(16) ));
   7594    assign( bbhi32s,
   7595            binop(Iop_SarN32x2,
   7596                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7597                  mkU8(16) ));
   7598    assign( bblo32s,
   7599            binop(Iop_SarN32x2,
   7600                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7601                  mkU8(16) ));
   7602    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7603    assign(
   7604       rHi,
   7605       binop(
   7606          Iop_ShrN32x2,
   7607          binop(
   7608             Iop_Add32x2,
   7609             binop(
   7610                Iop_ShrN32x2,
   7611                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7612                mkU8(14)
   7613             ),
   7614             mkexpr(one32x2)
   7615          ),
   7616          mkU8(1)
   7617       )
   7618    );
   7619    assign(
   7620       rLo,
   7621       binop(
   7622          Iop_ShrN32x2,
   7623          binop(
   7624             Iop_Add32x2,
   7625             binop(
   7626                Iop_ShrN32x2,
   7627                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7628                mkU8(14)
   7629             ),
   7630             mkexpr(one32x2)
   7631          ),
   7632          mkU8(1)
   7633       )
   7634    );
   7635    return
   7636       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7637 }
   7638 
   7639 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7640    values (aa,bb), computes, for each lane:
   7641 
   7642           if aa_lane < 0 then - bb_lane
   7643      else if aa_lane > 0 then bb_lane
   7644      else 0
   7645 */
   7646 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7647 {
   7648    IRTemp aa       = newTemp(Ity_I64);
   7649    IRTemp bb       = newTemp(Ity_I64);
   7650    IRTemp zero     = newTemp(Ity_I64);
   7651    IRTemp bbNeg    = newTemp(Ity_I64);
   7652    IRTemp negMask  = newTemp(Ity_I64);
   7653    IRTemp posMask  = newTemp(Ity_I64);
   7654    IROp   opSub    = Iop_INVALID;
   7655    IROp   opCmpGTS = Iop_INVALID;
   7656 
   7657    switch (laneszB) {
   7658       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7659       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7660       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7661       default: vassert(0);
   7662    }
   7663 
   7664    assign( aa,      aax );
   7665    assign( bb,      bbx );
   7666    assign( zero,    mkU64(0) );
   7667    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7668    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7669    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7670 
   7671    return
   7672       binop(Iop_Or64,
   7673             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7674             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7675 
   7676 }
   7677 
   7678 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7679    value aa, computes, for each lane
   7680 
   7681    if aa < 0 then -aa else aa
   7682 
   7683    Note that the result is interpreted as unsigned, so that the
   7684    absolute value of the most negative signed input can be
   7685    represented.
   7686 */
   7687 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7688 {
   7689    IRTemp aa      = newTemp(Ity_I64);
   7690    IRTemp zero    = newTemp(Ity_I64);
   7691    IRTemp aaNeg   = newTemp(Ity_I64);
   7692    IRTemp negMask = newTemp(Ity_I64);
   7693    IRTemp posMask = newTemp(Ity_I64);
   7694    IROp   opSub   = Iop_INVALID;
   7695    IROp   opSarN  = Iop_INVALID;
   7696 
   7697    switch (laneszB) {
   7698       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7699       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7700       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7701       default: vassert(0);
   7702    }
   7703 
   7704    assign( aa,      aax );
   7705    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7706    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7707    assign( zero,    mkU64(0) );
   7708    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7709    return
   7710       binop(Iop_Or64,
   7711             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7712             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7713 }
   7714 
   7715 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7716                                         IRTemp lo64, Int byteShift )
   7717 {
   7718    vassert(byteShift >= 1 && byteShift <= 7);
   7719    return
   7720       binop(Iop_Or64,
   7721             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7722             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7723       );
   7724 }
   7725 
   7726 /* Generate a SIGSEGV followed by a restart of the current instruction
   7727    if effective_addr is not 16-aligned.  This is required behaviour
   7728    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7729    This assumes that guest_RIP_curr_instr is set correctly! */
   7730 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7731 {
   7732    stmt(
   7733       IRStmt_Exit(
   7734          binop(Iop_CmpNE32,
   7735                binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
   7736                mkU32(0)),
   7737          Ijk_SigSEGV,
   7738          IRConst_U32(guest_EIP_curr_instr),
   7739          OFFB_EIP
   7740       )
   7741    );
   7742 }
   7743 
   7744 
   7745 /* Helper for deciding whether a given insn (starting at the opcode
   7746    byte) may validly be used with a LOCK prefix.  The following insns
   7747    may be used with LOCK when their destination operand is in memory.
   7748    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7749 
   7750    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7751    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7752    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7753    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7754    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7755    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7756    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7757 
   7758    DEC        FE /1,  FF /1
   7759    INC        FE /0,  FF /0
   7760 
   7761    NEG        F6 /3,  F7 /3
   7762    NOT        F6 /2,  F7 /2
   7763 
   7764    XCHG       86, 87
   7765 
   7766    BTC        0F BB,  0F BA /7
   7767    BTR        0F B3,  0F BA /6
   7768    BTS        0F AB,  0F BA /5
   7769 
   7770    CMPXCHG    0F B0,  0F B1
   7771    CMPXCHG8B  0F C7 /1
   7772 
   7773    XADD       0F C0,  0F C1
   7774 
   7775    ------------------------------
   7776 
   7777    80 /0  =  addb $imm8,  rm8
   7778    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7779    82 /0  =  addb $imm8,  rm8
   7780    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7781 
   7782    00     =  addb r8,  rm8
   7783    01     =  addl r32, rm32  and  addw r16, rm16
   7784 
   7785    Same for ADD OR ADC SBB AND SUB XOR
   7786 
   7787    FE /1  = dec rm8
   7788    FF /1  = dec rm32  and  dec rm16
   7789 
   7790    FE /0  = inc rm8
   7791    FF /0  = inc rm32  and  inc rm16
   7792 
   7793    F6 /3  = neg rm8
   7794    F7 /3  = neg rm32  and  neg rm16
   7795 
   7796    F6 /2  = not rm8
   7797    F7 /2  = not rm32  and  not rm16
   7798 
   7799    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7800    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   7801 
   7802    Same for BTS, BTR
   7803 */
   7804 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   7805 {
   7806    switch (opc[0]) {
   7807       case 0x00: case 0x01: case 0x08: case 0x09:
   7808       case 0x10: case 0x11: case 0x18: case 0x19:
   7809       case 0x20: case 0x21: case 0x28: case 0x29:
   7810       case 0x30: case 0x31:
   7811          if (!epartIsReg(opc[1]))
   7812             return True;
   7813          break;
   7814 
   7815       case 0x80: case 0x81: case 0x82: case 0x83:
   7816          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
   7817              && !epartIsReg(opc[1]))
   7818             return True;
   7819          break;
   7820 
   7821       case 0xFE: case 0xFF:
   7822          if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
   7823              && !epartIsReg(opc[1]))
   7824             return True;
   7825          break;
   7826 
   7827       case 0xF6: case 0xF7:
   7828          if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
   7829              && !epartIsReg(opc[1]))
   7830             return True;
   7831          break;
   7832 
   7833       case 0x86: case 0x87:
   7834          if (!epartIsReg(opc[1]))
   7835             return True;
   7836          break;
   7837 
   7838       case 0x0F: {
   7839          switch (opc[1]) {
   7840             case 0xBB: case 0xB3: case 0xAB:
   7841                if (!epartIsReg(opc[2]))
   7842                   return True;
   7843                break;
   7844             case 0xBA:
   7845                if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
   7846                    && !epartIsReg(opc[2]))
   7847                   return True;
   7848                break;
   7849             case 0xB0: case 0xB1:
   7850                if (!epartIsReg(opc[2]))
   7851                   return True;
   7852                break;
   7853             case 0xC7:
   7854                if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   7855                   return True;
   7856                break;
   7857             case 0xC0: case 0xC1:
   7858                if (!epartIsReg(opc[2]))
   7859                   return True;
   7860                break;
   7861             default:
   7862                break;
   7863          } /* switch (opc[1]) */
   7864          break;
   7865       }
   7866 
   7867       default:
   7868          break;
   7869    } /* switch (opc[0]) */
   7870 
   7871    return False;
   7872 }
   7873 
   7874 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   7875 {
   7876    IRTemp t2 = newTemp(ty);
   7877    if (ty == Ity_I32) {
   7878       assign( t2,
   7879          binop(
   7880             Iop_Or32,
   7881             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   7882             binop(
   7883                Iop_Or32,
   7884                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   7885                                 mkU32(0x00FF0000)),
   7886                binop(Iop_Or32,
   7887                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   7888                                       mkU32(0x0000FF00)),
   7889                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   7890                                       mkU32(0x000000FF) )
   7891             )))
   7892       );
   7893       return t2;
   7894    }
   7895    if (ty == Ity_I16) {
   7896       assign(t2,
   7897              binop(Iop_Or16,
   7898                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   7899                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   7900       return t2;
   7901    }
   7902    vassert(0);
   7903    /*NOTREACHED*/
   7904    return IRTemp_INVALID;
   7905 }
   7906 
   7907 /*------------------------------------------------------------*/
   7908 /*--- Disassemble a single instruction                     ---*/
   7909 /*------------------------------------------------------------*/
   7910 
   7911 /* Disassemble a single instruction into IR.  The instruction is
   7912    located in host memory at &guest_code[delta].  *expect_CAS is set
   7913    to True if the resulting IR is expected to contain an IRCAS
   7914    statement, and False if it's not expected to.  This makes it
   7915    possible for the caller of disInstr_X86_WRK to check that
   7916    LOCK-prefixed instructions are at least plausibly translated, in
   7917    that it becomes possible to check that a (validly) LOCK-prefixed
   7918    instruction generates a translation containing an IRCAS, and
   7919    instructions without LOCK prefixes don't generate translations
   7920    containing an IRCAS.
   7921 */
   7922 static
   7923 DisResult disInstr_X86_WRK (
   7924              /*OUT*/Bool* expect_CAS,
   7925              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   7926              Bool         resteerCisOk,
   7927              void*        callback_opaque,
   7928              Long         delta64,
   7929              VexArchInfo* archinfo,
   7930              VexAbiInfo*  vbi
   7931           )
   7932 {
   7933    IRType    ty;
   7934    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   7935    Int       alen;
   7936    UChar     opc, modrm, abyte, pre;
   7937    UInt      d32;
   7938    HChar     dis_buf[50];
   7939    Int       am_sz, d_sz, n_prefixes;
   7940    DisResult dres;
   7941    UChar*    insn; /* used in SSE decoders */
   7942 
   7943    /* The running delta */
   7944    Int delta = (Int)delta64;
   7945 
   7946    /* Holds eip at the start of the insn, so that we can print
   7947       consistent error messages for unimplemented insns. */
   7948    Int delta_start = delta;
   7949 
   7950    /* sz denotes the nominal data-op size of the insn; we change it to
   7951       2 if an 0x66 prefix is seen */
   7952    Int sz = 4;
   7953 
   7954    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   7955       prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
   7956       indicating the prefix.  */
   7957    UChar sorb = 0;
   7958 
   7959    /* Gets set to True if a LOCK prefix is seen. */
   7960    Bool pfx_lock = False;
   7961 
   7962    /* Set result defaults. */
   7963    dres.whatNext    = Dis_Continue;
   7964    dres.len         = 0;
   7965    dres.continueAt  = 0;
   7966    dres.jk_StopHere = Ijk_INVALID;
   7967 
   7968    *expect_CAS = False;
   7969 
   7970    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   7971 
   7972    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   7973    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   7974 
   7975    /* Spot "Special" instructions (see comment at top of file). */
   7976    {
   7977       UChar* code = (UChar*)(guest_code + delta);
   7978       /* Spot the 12-byte preamble:
   7979          C1C703   roll $3,  %edi
   7980          C1C70D   roll $13, %edi
   7981          C1C71D   roll $29, %edi
   7982          C1C713   roll $19, %edi
   7983       */
   7984       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   7985           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   7986           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   7987           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   7988          /* Got a "Special" instruction preamble.  Which one is it? */
   7989          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   7990             /* %EDX = client_request ( %EAX ) */
   7991             DIP("%%edx = client_request ( %%eax )\n");
   7992             delta += 14;
   7993             jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
   7994             vassert(dres.whatNext == Dis_StopHere);
   7995             goto decode_success;
   7996          }
   7997          else
   7998          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   7999             /* %EAX = guest_NRADDR */
   8000             DIP("%%eax = guest_NRADDR\n");
   8001             delta += 14;
   8002             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   8003             goto decode_success;
   8004          }
   8005          else
   8006          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   8007             /* call-noredir *%EAX */
   8008             DIP("call-noredir *%%eax\n");
   8009             delta += 14;
   8010             t1 = newTemp(Ity_I32);
   8011             assign(t1, getIReg(4,R_EAX));
   8012             t2 = newTemp(Ity_I32);
   8013             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   8014             putIReg(4, R_ESP, mkexpr(t2));
   8015             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   8016             jmp_treg(&dres, Ijk_NoRedir, t1);
   8017             vassert(dres.whatNext == Dis_StopHere);
   8018             goto decode_success;
   8019          }
   8020          /* We don't know what it is. */
   8021          goto decode_failure;
   8022          /*NOTREACHED*/
   8023       }
   8024    }
   8025 
   8026    /* Handle a couple of weird-ass NOPs that have been observed in the
   8027       wild. */
   8028    {
   8029       UChar* code = (UChar*)(guest_code + delta);
   8030       /* Sun's JVM 1.5.0 uses the following as a NOP:
   8031          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   8032       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   8033           && code[3] == 0x65 && code[4] == 0x90) {
   8034          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   8035          delta += 5;
   8036          goto decode_success;
   8037       }
   8038       /* Don't barf on recent binutils padding,
   8039          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   8040          66 2e 0f 1f 84 00 00 00 00 00
   8041          66 66 2e 0f 1f 84 00 00 00 00 00
   8042          66 66 66 2e 0f 1f 84 00 00 00 00 00
   8043          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8044          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8045          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   8046       */
   8047       if (code[0] == 0x66) {
   8048          Int data16_cnt;
   8049          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   8050             if (code[data16_cnt] != 0x66)
   8051                break;
   8052          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   8053              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   8054              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   8055              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   8056              && code[data16_cnt + 8] == 0x00 ) {
   8057             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   8058             delta += 9 + data16_cnt;
   8059             goto decode_success;
   8060          }
   8061       }
   8062    }
   8063 
   8064    /* Normal instruction handling starts here. */
   8065 
   8066    /* Deal with some but not all prefixes:
   8067          66(oso)
   8068          F0(lock)
   8069          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8070       Not dealt with (left in place):
   8071          F2 F3
   8072    */
   8073    n_prefixes = 0;
   8074    while (True) {
   8075       if (n_prefixes > 7) goto decode_failure;
   8076       pre = getUChar(delta);
   8077       switch (pre) {
   8078          case 0x66:
   8079             sz = 2;
   8080             break;
   8081          case 0xF0:
   8082             pfx_lock = True;
   8083             *expect_CAS = True;
   8084             break;
   8085          case 0x3E: /* %DS: */
   8086          case 0x26: /* %ES: */
   8087          case 0x64: /* %FS: */
   8088          case 0x65: /* %GS: */
   8089             if (sorb != 0)
   8090                goto decode_failure; /* only one seg override allowed */
   8091             sorb = pre;
   8092             break;
   8093          case 0x2E: { /* %CS: */
   8094             /* 2E prefix on a conditional branch instruction is a
   8095                branch-prediction hint, which can safely be ignored.  */
   8096             UChar op1 = getIByte(delta+1);
   8097             UChar op2 = getIByte(delta+2);
   8098             if ((op1 >= 0x70 && op1 <= 0x7F)
   8099                 || (op1 == 0xE3)
   8100                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8101                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8102             } else {
   8103                /* All other CS override cases are not handled */
   8104                goto decode_failure;
   8105             }
   8106             break;
   8107          }
   8108          case 0x36: /* %SS: */
   8109             /* SS override cases are not handled */
   8110             goto decode_failure;
   8111          default:
   8112             goto not_a_prefix;
   8113       }
   8114       n_prefixes++;
   8115       delta++;
   8116    }
   8117 
   8118    not_a_prefix:
   8119 
   8120    /* Now we should be looking at the primary opcode byte or the
   8121       leading F2 or F3.  Check that any LOCK prefix is actually
   8122       allowed. */
   8123 
   8124    if (pfx_lock) {
   8125       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   8126          DIP("lock ");
   8127       } else {
   8128          *expect_CAS = False;
   8129          goto decode_failure;
   8130       }
   8131    }
   8132 
   8133 
   8134    /* ---------------------------------------------------- */
   8135    /* --- The SSE decoder.                             --- */
   8136    /* ---------------------------------------------------- */
   8137 
   8138    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8139       previous life? */
   8140 
   8141    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8142       later section, further on. */
   8143 
   8144    insn = (UChar*)&guest_code[delta];
   8145 
   8146    /* Treat fxsave specially.  It should be doable even on an SSE0
   8147       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8148       any subarchitecture variant.
   8149    */
   8150 
   8151    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8152    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8153        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8154       IRDirty* d;
   8155       modrm = getIByte(delta+2);
   8156       vassert(sz == 4);
   8157       vassert(!epartIsReg(modrm));
   8158 
   8159       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8160       delta += 2+alen;
   8161       gen_SEGV_if_not_16_aligned(addr);
   8162 
   8163       DIP("fxsave %s\n", dis_buf);
   8164 
   8165       /* Uses dirty helper:
   8166             void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, UInt ) */
   8167       d = unsafeIRDirty_0_N (
   8168              0/*regparms*/,
   8169              "x86g_dirtyhelper_FXSAVE",
   8170              &x86g_dirtyhelper_FXSAVE,
   8171              mkIRExprVec_1( mkexpr(addr) )
   8172           );
   8173       d->needsBBP = True;
   8174 
   8175       /* declare we're writing memory */
   8176       d->mFx   = Ifx_Write;
   8177       d->mAddr = mkexpr(addr);
   8178       d->mSize = 464; /* according to recent Intel docs */
   8179 
   8180       /* declare we're reading guest state */
   8181       d->nFxState = 7;
   8182       vex_bzero(&d->fxState, sizeof(d->fxState));
   8183 
   8184       d->fxState[0].fx     = Ifx_Read;
   8185       d->fxState[0].offset = OFFB_FTOP;
   8186       d->fxState[0].size   = sizeof(UInt);
   8187 
   8188       d->fxState[1].fx     = Ifx_Read;
   8189       d->fxState[1].offset = OFFB_FPREGS;
   8190       d->fxState[1].size   = 8 * sizeof(ULong);
   8191 
   8192       d->fxState[2].fx     = Ifx_Read;
   8193       d->fxState[2].offset = OFFB_FPTAGS;
   8194       d->fxState[2].size   = 8 * sizeof(UChar);
   8195 
   8196       d->fxState[3].fx     = Ifx_Read;
   8197       d->fxState[3].offset = OFFB_FPROUND;
   8198       d->fxState[3].size   = sizeof(UInt);
   8199 
   8200       d->fxState[4].fx     = Ifx_Read;
   8201       d->fxState[4].offset = OFFB_FC3210;
   8202       d->fxState[4].size   = sizeof(UInt);
   8203 
   8204       d->fxState[5].fx     = Ifx_Read;
   8205       d->fxState[5].offset = OFFB_XMM0;
   8206       d->fxState[5].size   = 8 * sizeof(U128);
   8207 
   8208       d->fxState[6].fx     = Ifx_Read;
   8209       d->fxState[6].offset = OFFB_SSEROUND;
   8210       d->fxState[6].size   = sizeof(UInt);
   8211 
   8212       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8213 	 images are packed back-to-back.  If not, the value of
   8214 	 d->fxState[5].size is wrong. */
   8215       vassert(16 == sizeof(U128));
   8216       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8217 
   8218       stmt( IRStmt_Dirty(d) );
   8219 
   8220       goto decode_success;
   8221    }
   8222 
   8223    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8224    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8225        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8226       IRDirty* d;
   8227       modrm = getIByte(delta+2);
   8228       vassert(sz == 4);
   8229       vassert(!epartIsReg(modrm));
   8230 
   8231       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8232       delta += 2+alen;
   8233       gen_SEGV_if_not_16_aligned(addr);
   8234 
   8235       DIP("fxrstor %s\n", dis_buf);
   8236 
   8237       /* Uses dirty helper:
   8238             VexEmWarn x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt )
   8239          NOTE:
   8240             the VexEmWarn value is simply ignored (unlike for FRSTOR)
   8241       */
   8242       d = unsafeIRDirty_0_N (
   8243              0/*regparms*/,
   8244              "x86g_dirtyhelper_FXRSTOR",
   8245              &x86g_dirtyhelper_FXRSTOR,
   8246              mkIRExprVec_1( mkexpr(addr) )
   8247           );
   8248       d->needsBBP = True;
   8249 
   8250       /* declare we're reading memory */
   8251       d->mFx   = Ifx_Read;
   8252       d->mAddr = mkexpr(addr);
   8253       d->mSize = 464; /* according to recent Intel docs */
   8254 
   8255       /* declare we're writing guest state */
   8256       d->nFxState = 7;
   8257       vex_bzero(&d->fxState, sizeof(d->fxState));
   8258 
   8259       d->fxState[0].fx     = Ifx_Write;
   8260       d->fxState[0].offset = OFFB_FTOP;
   8261       d->fxState[0].size   = sizeof(UInt);
   8262 
   8263       d->fxState[1].fx     = Ifx_Write;
   8264       d->fxState[1].offset = OFFB_FPREGS;
   8265       d->fxState[1].size   = 8 * sizeof(ULong);
   8266 
   8267       d->fxState[2].fx     = Ifx_Write;
   8268       d->fxState[2].offset = OFFB_FPTAGS;
   8269       d->fxState[2].size   = 8 * sizeof(UChar);
   8270 
   8271       d->fxState[3].fx     = Ifx_Write;
   8272       d->fxState[3].offset = OFFB_FPROUND;
   8273       d->fxState[3].size   = sizeof(UInt);
   8274 
   8275       d->fxState[4].fx     = Ifx_Write;
   8276       d->fxState[4].offset = OFFB_FC3210;
   8277       d->fxState[4].size   = sizeof(UInt);
   8278 
   8279       d->fxState[5].fx     = Ifx_Write;
   8280       d->fxState[5].offset = OFFB_XMM0;
   8281       d->fxState[5].size   = 8 * sizeof(U128);
   8282 
   8283       d->fxState[6].fx     = Ifx_Write;
   8284       d->fxState[6].offset = OFFB_SSEROUND;
   8285       d->fxState[6].size   = sizeof(UInt);
   8286 
   8287       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8288 	 images are packed back-to-back.  If not, the value of
   8289 	 d->fxState[5].size is wrong. */
   8290       vassert(16 == sizeof(U128));
   8291       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8292 
   8293       stmt( IRStmt_Dirty(d) );
   8294 
   8295       goto decode_success;
   8296    }
   8297 
   8298    /* ------ SSE decoder main ------ */
   8299 
   8300    /* Skip parts of the decoder which don't apply given the stated
   8301       guest subarchitecture. */
   8302    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8303       goto after_sse_decoders;
   8304 
   8305    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8306       for SSE1 here. */
   8307 
   8308    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8309    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8310       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8311       goto decode_success;
   8312    }
   8313 
   8314    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8315    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8316       vassert(sz == 4);
   8317       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8318       goto decode_success;
   8319    }
   8320 
   8321    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8322    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8323       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8324       goto decode_success;
   8325    }
   8326 
   8327    /* 0F 54 = ANDPS -- G = G and E */
   8328    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8329       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8330       goto decode_success;
   8331    }
   8332 
   8333    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8334    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8335       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8336       goto decode_success;
   8337    }
   8338 
   8339    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8340    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8341       vassert(sz == 4);
   8342       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8343       goto decode_success;
   8344    }
   8345 
   8346    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8347    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8348    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8349       IRTemp argL = newTemp(Ity_F32);
   8350       IRTemp argR = newTemp(Ity_F32);
   8351       modrm = getIByte(delta+2);
   8352       if (epartIsReg(modrm)) {
   8353          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8354          delta += 2+1;
   8355          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8356                                   nameXMMReg(gregOfRM(modrm)) );
   8357       } else {
   8358          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8359 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8360          delta += 2+alen;
   8361          DIP("[u]comiss %s,%s\n", dis_buf,
   8362                                   nameXMMReg(gregOfRM(modrm)) );
   8363       }
   8364       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8365 
   8366       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8367       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8368       stmt( IRStmt_Put(
   8369                OFFB_CC_DEP1,
   8370                binop( Iop_And32,
   8371                       binop(Iop_CmpF64,
   8372                             unop(Iop_F32toF64,mkexpr(argL)),
   8373                             unop(Iop_F32toF64,mkexpr(argR))),
   8374                       mkU32(0x45)
   8375           )));
   8376       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8377          elimination of previous stores to this field work better. */
   8378       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8379       goto decode_success;
   8380    }
   8381 
   8382    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8383       half xmm */
   8384    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8385       IRTemp arg64 = newTemp(Ity_I64);
   8386       IRTemp rmode = newTemp(Ity_I32);
   8387       vassert(sz == 4);
   8388 
   8389       modrm = getIByte(delta+2);
   8390       do_MMX_preamble();
   8391       if (epartIsReg(modrm)) {
   8392          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8393          delta += 2+1;
   8394          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8395                                  nameXMMReg(gregOfRM(modrm)));
   8396       } else {
   8397          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8398 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8399          delta += 2+alen;
   8400          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8401                                  nameXMMReg(gregOfRM(modrm)) );
   8402       }
   8403 
   8404       assign( rmode, get_sse_roundingmode() );
   8405 
   8406       putXMMRegLane32F(
   8407          gregOfRM(modrm), 0,
   8408          binop(Iop_F64toF32,
   8409                mkexpr(rmode),
   8410                unop(Iop_I32StoF64,
   8411                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8412 
   8413       putXMMRegLane32F(
   8414          gregOfRM(modrm), 1,
   8415          binop(Iop_F64toF32,
   8416                mkexpr(rmode),
   8417                unop(Iop_I32StoF64,
   8418                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8419 
   8420       goto decode_success;
   8421    }
   8422 
   8423    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8424       quarter xmm */
   8425    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8426       IRTemp arg32 = newTemp(Ity_I32);
   8427       IRTemp rmode = newTemp(Ity_I32);
   8428       vassert(sz == 4);
   8429 
   8430       modrm = getIByte(delta+3);
   8431       if (epartIsReg(modrm)) {
   8432          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8433          delta += 3+1;
   8434          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8435                                  nameXMMReg(gregOfRM(modrm)));
   8436       } else {
   8437          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8438 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8439          delta += 3+alen;
   8440          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8441                                  nameXMMReg(gregOfRM(modrm)) );
   8442       }
   8443 
   8444       assign( rmode, get_sse_roundingmode() );
   8445 
   8446       putXMMRegLane32F(
   8447          gregOfRM(modrm), 0,
   8448          binop(Iop_F64toF32,
   8449                mkexpr(rmode),
   8450                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8451 
   8452       goto decode_success;
   8453    }
   8454 
   8455    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8456       I32 in mmx, according to prevailing SSE rounding mode */
   8457    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8458       I32 in mmx, rounding towards zero */
   8459    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8460       IRTemp dst64  = newTemp(Ity_I64);
   8461       IRTemp rmode  = newTemp(Ity_I32);
   8462       IRTemp f32lo  = newTemp(Ity_F32);
   8463       IRTemp f32hi  = newTemp(Ity_F32);
   8464       Bool   r2zero = toBool(insn[1] == 0x2C);
   8465 
   8466       do_MMX_preamble();
   8467       modrm = getIByte(delta+2);
   8468 
   8469       if (epartIsReg(modrm)) {
   8470          delta += 2+1;
   8471 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8472 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8473          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8474                                    nameXMMReg(eregOfRM(modrm)),
   8475                                    nameMMXReg(gregOfRM(modrm)));
   8476       } else {
   8477          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8478 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8479 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8480                                               mkexpr(addr),
   8481                                               mkU32(4) )));
   8482          delta += 2+alen;
   8483          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8484                                    dis_buf,
   8485                                    nameMMXReg(gregOfRM(modrm)));
   8486       }
   8487 
   8488       if (r2zero) {
   8489          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8490       } else {
   8491          assign( rmode, get_sse_roundingmode() );
   8492       }
   8493 
   8494       assign(
   8495          dst64,
   8496          binop( Iop_32HLto64,
   8497                 binop( Iop_F64toI32S,
   8498                        mkexpr(rmode),
   8499                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8500                 binop( Iop_F64toI32S,
   8501                        mkexpr(rmode),
   8502                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8503               )
   8504       );
   8505 
   8506       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8507       goto decode_success;
   8508    }
   8509 
   8510    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8511       I32 in ireg, according to prevailing SSE rounding mode */
   8512    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8513       I32 in ireg, rounding towards zero */
   8514    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8515        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8516       IRTemp rmode = newTemp(Ity_I32);
   8517       IRTemp f32lo = newTemp(Ity_F32);
   8518       Bool   r2zero = toBool(insn[2] == 0x2C);
   8519       vassert(sz == 4);
   8520 
   8521       modrm = getIByte(delta+3);
   8522       if (epartIsReg(modrm)) {
   8523          delta += 3+1;
   8524 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8525          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8526                                    nameXMMReg(eregOfRM(modrm)),
   8527                                    nameIReg(4, gregOfRM(modrm)));
   8528       } else {
   8529          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8530 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8531          delta += 3+alen;
   8532          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8533                                    dis_buf,
   8534                                    nameIReg(4, gregOfRM(modrm)));
   8535       }
   8536 
   8537       if (r2zero) {
   8538          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8539       } else {
   8540          assign( rmode, get_sse_roundingmode() );
   8541       }
   8542 
   8543       putIReg(4, gregOfRM(modrm),
   8544                  binop( Iop_F64toI32S,
   8545                         mkexpr(rmode),
   8546                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8547       );
   8548 
   8549       goto decode_success;
   8550    }
   8551 
   8552    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8553    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8554       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8555       goto decode_success;
   8556    }
   8557 
   8558    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8559    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8560       vassert(sz == 4);
   8561       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8562       goto decode_success;
   8563    }
   8564 
   8565    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8566    if (insn[0] == 0x0F && insn[1] == 0xAE
   8567        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8568 
   8569       IRTemp t64 = newTemp(Ity_I64);
   8570       IRTemp ew = newTemp(Ity_I32);
   8571 
   8572       modrm = getIByte(delta+2);
   8573       vassert(!epartIsReg(modrm));
   8574       vassert(sz == 4);
   8575 
   8576       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8577       delta += 2+alen;
   8578       DIP("ldmxcsr %s\n", dis_buf);
   8579 
   8580       /* The only thing we observe in %mxcsr is the rounding mode.
   8581          Therefore, pass the 32-bit value (SSE native-format control
   8582          word) to a clean helper, getting back a 64-bit value, the
   8583          lower half of which is the SSEROUND value to store, and the
   8584          upper half of which is the emulation-warning token which may
   8585          be generated.
   8586       */
   8587       /* ULong x86g_check_ldmxcsr ( UInt ); */
   8588       assign( t64, mkIRExprCCall(
   8589                       Ity_I64, 0/*regparms*/,
   8590                       "x86g_check_ldmxcsr",
   8591                       &x86g_check_ldmxcsr,
   8592                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8593                    )
   8594             );
   8595 
   8596       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8597       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8598       put_emwarn( mkexpr(ew) );
   8599       /* Finally, if an emulation warning was reported, side-exit to
   8600          the next insn, reporting the warning, so that Valgrind's
   8601          dispatcher sees the warning. */
   8602       stmt(
   8603          IRStmt_Exit(
   8604             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8605             Ijk_EmWarn,
   8606             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
   8607             OFFB_EIP
   8608          )
   8609       );
   8610       goto decode_success;
   8611    }
   8612 
   8613    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8614    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8615    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8616       Bool ok = False;
   8617       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8618       if (!ok)
   8619          goto decode_failure;
   8620       goto decode_success;
   8621    }
   8622 
   8623    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   8624    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   8625       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   8626       goto decode_success;
   8627    }
   8628 
   8629    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   8630    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   8631       vassert(sz == 4);
   8632       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   8633       goto decode_success;
   8634    }
   8635 
   8636    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   8637    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   8638       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   8639       goto decode_success;
   8640    }
   8641 
   8642    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   8643    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   8644       vassert(sz == 4);
   8645       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   8646       goto decode_success;
   8647    }
   8648 
   8649    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   8650    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   8651    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   8652       modrm = getIByte(delta+2);
   8653       if (epartIsReg(modrm)) {
   8654          putXMMReg( gregOfRM(modrm),
   8655                     getXMMReg( eregOfRM(modrm) ));
   8656          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8657                                   nameXMMReg(gregOfRM(modrm)));
   8658          delta += 2+1;
   8659       } else {
   8660          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8661          if (insn[1] == 0x28/*movaps*/)
   8662             gen_SEGV_if_not_16_aligned( addr );
   8663          putXMMReg( gregOfRM(modrm),
   8664                     loadLE(Ity_V128, mkexpr(addr)) );
   8665          DIP("mov[ua]ps %s,%s\n", dis_buf,
   8666                                   nameXMMReg(gregOfRM(modrm)));
   8667          delta += 2+alen;
   8668       }
   8669       goto decode_success;
   8670    }
   8671 
   8672    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   8673    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   8674    if (sz == 4 && insn[0] == 0x0F
   8675        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   8676       modrm = getIByte(delta+2);
   8677       if (epartIsReg(modrm)) {
   8678          /* fall through; awaiting test case */
   8679       } else {
   8680          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8681          if (insn[1] == 0x29/*movaps*/)
   8682             gen_SEGV_if_not_16_aligned( addr );
   8683          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8684          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8685                                   dis_buf );
   8686          delta += 2+alen;
   8687          goto decode_success;
   8688       }
   8689    }
   8690 
   8691    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   8692    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   8693    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   8694       modrm = getIByte(delta+2);
   8695       if (epartIsReg(modrm)) {
   8696          delta += 2+1;
   8697          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8698                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   8699          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8700                                nameXMMReg(gregOfRM(modrm)));
   8701       } else {
   8702          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8703          delta += 2+alen;
   8704          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8705                           loadLE(Ity_I64, mkexpr(addr)) );
   8706          DIP("movhps %s,%s\n", dis_buf,
   8707                                nameXMMReg( gregOfRM(modrm) ));
   8708       }
   8709       goto decode_success;
   8710    }
   8711 
   8712    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   8713    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   8714       if (!epartIsReg(insn[2])) {
   8715          delta += 2;
   8716          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8717          delta += alen;
   8718          storeLE( mkexpr(addr),
   8719                   getXMMRegLane64( gregOfRM(insn[2]),
   8720                                    1/*upper lane*/ ) );
   8721          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8722                                dis_buf);
   8723          goto decode_success;
   8724       }
   8725       /* else fall through */
   8726    }
   8727 
   8728    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   8729    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   8730    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   8731       modrm = getIByte(delta+2);
   8732       if (epartIsReg(modrm)) {
   8733          delta += 2+1;
   8734          putXMMRegLane64( gregOfRM(modrm),
   8735                           0/*lower lane*/,
   8736                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   8737          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   8738                                  nameXMMReg(gregOfRM(modrm)));
   8739       } else {
   8740          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8741          delta += 2+alen;
   8742          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   8743                           loadLE(Ity_I64, mkexpr(addr)) );
   8744          DIP("movlps %s, %s\n",
   8745              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   8746       }
   8747       goto decode_success;
   8748    }
   8749 
   8750    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   8751    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   8752       if (!epartIsReg(insn[2])) {
   8753          delta += 2;
   8754          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8755          delta += alen;
   8756          storeLE( mkexpr(addr),
   8757                   getXMMRegLane64( gregOfRM(insn[2]),
   8758                                    0/*lower lane*/ ) );
   8759          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8760                                 dis_buf);
   8761          goto decode_success;
   8762       }
   8763       /* else fall through */
   8764    }
   8765 
   8766    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   8767       to 4 lowest bits of ireg(G) */
   8768    if (insn[0] == 0x0F && insn[1] == 0x50) {
   8769       modrm = getIByte(delta+2);
   8770       if (sz == 4 && epartIsReg(modrm)) {
   8771          Int src;
   8772          t0 = newTemp(Ity_I32);
   8773          t1 = newTemp(Ity_I32);
   8774          t2 = newTemp(Ity_I32);
   8775          t3 = newTemp(Ity_I32);
   8776          delta += 2+1;
   8777          src = eregOfRM(modrm);
   8778          assign( t0, binop( Iop_And32,
   8779                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   8780                             mkU32(1) ));
   8781          assign( t1, binop( Iop_And32,
   8782                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   8783                             mkU32(2) ));
   8784          assign( t2, binop( Iop_And32,
   8785                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   8786                             mkU32(4) ));
   8787          assign( t3, binop( Iop_And32,
   8788                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   8789                             mkU32(8) ));
   8790          putIReg(4, gregOfRM(modrm),
   8791                     binop(Iop_Or32,
   8792                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   8793                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   8794                          )
   8795                  );
   8796          DIP("movmskps %s,%s\n", nameXMMReg(src),
   8797                                  nameIReg(4, gregOfRM(modrm)));
   8798          goto decode_success;
   8799       }
   8800       /* else fall through */
   8801    }
   8802 
   8803    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   8804    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   8805    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   8806       modrm = getIByte(delta+2);
   8807       if (!epartIsReg(modrm)) {
   8808          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8809          gen_SEGV_if_not_16_aligned( addr );
   8810          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8811          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   8812                                  dis_buf,
   8813                                  nameXMMReg(gregOfRM(modrm)));
   8814          delta += 2+alen;
   8815          goto decode_success;
   8816       }
   8817       /* else fall through */
   8818    }
   8819 
   8820    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8821    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8822       Intel manual does not say anything about the usual business of
   8823       the FP reg tags getting trashed whenever an MMX insn happens.
   8824       So we just leave them alone.
   8825    */
   8826    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8827       modrm = getIByte(delta+2);
   8828       if (sz == 4 && !epartIsReg(modrm)) {
   8829          /* do_MMX_preamble(); Intel docs don't specify this */
   8830          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8831          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8832          DIP("movntq %s,%s\n", dis_buf,
   8833                                nameMMXReg(gregOfRM(modrm)));
   8834          delta += 2+alen;
   8835          goto decode_success;
   8836       }
   8837       /* else fall through */
   8838    }
   8839 
   8840    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   8841       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   8842    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   8843       vassert(sz == 4);
   8844       modrm = getIByte(delta+3);
   8845       if (epartIsReg(modrm)) {
   8846          putXMMRegLane32( gregOfRM(modrm), 0,
   8847                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   8848          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8849                               nameXMMReg(gregOfRM(modrm)));
   8850          delta += 3+1;
   8851       } else {
   8852          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8853          /* zero bits 127:64 */
   8854          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   8855          /* zero bits 63:32 */
   8856          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   8857          /* write bits 31:0 */
   8858          putXMMRegLane32( gregOfRM(modrm), 0,
   8859                           loadLE(Ity_I32, mkexpr(addr)) );
   8860          DIP("movss %s,%s\n", dis_buf,
   8861                               nameXMMReg(gregOfRM(modrm)));
   8862          delta += 3+alen;
   8863       }
   8864       goto decode_success;
   8865    }
   8866 
   8867    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   8868       or lo 1/4 xmm). */
   8869    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   8870       vassert(sz == 4);
   8871       modrm = getIByte(delta+3);
   8872       if (epartIsReg(modrm)) {
   8873          /* fall through, we don't yet have a test case */
   8874       } else {
   8875          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8876          storeLE( mkexpr(addr),
   8877                   getXMMRegLane32(gregOfRM(modrm), 0) );
   8878          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8879                               dis_buf);
   8880          delta += 3+alen;
   8881          goto decode_success;
   8882       }
   8883    }
   8884 
   8885    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   8886    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   8887       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   8888       goto decode_success;
   8889    }
   8890 
   8891    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   8892    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   8893       vassert(sz == 4);
   8894       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   8895       goto decode_success;
   8896    }
   8897 
   8898    /* 0F 56 = ORPS -- G = G or E */
   8899    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   8900       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   8901       goto decode_success;
   8902    }
   8903 
   8904    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8905    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8906    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8907       do_MMX_preamble();
   8908       delta = dis_MMXop_regmem_to_reg (
   8909                 sorb, delta+2, insn[1], "pavgb", False );
   8910       goto decode_success;
   8911    }
   8912 
   8913    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8914    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8915    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8916       do_MMX_preamble();
   8917       delta = dis_MMXop_regmem_to_reg (
   8918                 sorb, delta+2, insn[1], "pavgw", False );
   8919       goto decode_success;
   8920    }
   8921 
   8922    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8923    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8924       zero-extend of it in ireg(G). */
   8925    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8926       modrm = insn[2];
   8927       if (sz == 4 && epartIsReg(modrm)) {
   8928          IRTemp sV = newTemp(Ity_I64);
   8929          t5 = newTemp(Ity_I16);
   8930          do_MMX_preamble();
   8931          assign(sV, getMMXReg(eregOfRM(modrm)));
   8932          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8933          switch (insn[3] & 3) {
   8934             case 0:  assign(t5, mkexpr(t0)); break;
   8935             case 1:  assign(t5, mkexpr(t1)); break;
   8936             case 2:  assign(t5, mkexpr(t2)); break;
   8937             case 3:  assign(t5, mkexpr(t3)); break;
   8938             default: vassert(0); /*NOTREACHED*/
   8939          }
   8940          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8941          DIP("pextrw $%d,%s,%s\n",
   8942              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8943                            nameIReg(4,gregOfRM(modrm)));
   8944          delta += 4;
   8945          goto decode_success;
   8946       }
   8947       /* else fall through */
   8948    }
   8949 
   8950    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8951    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8952       put it into the specified lane of mmx(G). */
   8953    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8954       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8955          mmx reg.  t4 is the new lane value.  t5 is the original
   8956          mmx value. t6 is the new mmx value. */
   8957       Int lane;
   8958       t4 = newTemp(Ity_I16);
   8959       t5 = newTemp(Ity_I64);
   8960       t6 = newTemp(Ity_I64);
   8961       modrm = insn[2];
   8962       do_MMX_preamble();
   8963 
   8964       assign(t5, getMMXReg(gregOfRM(modrm)));
   8965       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8966 
   8967       if (epartIsReg(modrm)) {
   8968          assign(t4, getIReg(2, eregOfRM(modrm)));
   8969          delta += 3+1;
   8970          lane = insn[3+1-1];
   8971          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8972                                    nameIReg(2,eregOfRM(modrm)),
   8973                                    nameMMXReg(gregOfRM(modrm)));
   8974       } else {
   8975          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8976          delta += 3+alen;
   8977          lane = insn[3+alen-1];
   8978          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8979          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8980                                    dis_buf,
   8981                                    nameMMXReg(gregOfRM(modrm)));
   8982       }
   8983 
   8984       switch (lane & 3) {
   8985          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8986          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8987          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8988          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8989          default: vassert(0); /*NOTREACHED*/
   8990       }
   8991       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8992       goto decode_success;
   8993    }
   8994 
   8995    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8996    /* 0F EE = PMAXSW -- 16x4 signed max */
   8997    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8998       do_MMX_preamble();
   8999       delta = dis_MMXop_regmem_to_reg (
   9000                 sorb, delta+2, insn[1], "pmaxsw", False );
   9001       goto decode_success;
   9002    }
   9003 
   9004    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9005    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   9006    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   9007       do_MMX_preamble();
   9008       delta = dis_MMXop_regmem_to_reg (
   9009                 sorb, delta+2, insn[1], "pmaxub", False );
   9010       goto decode_success;
   9011    }
   9012 
   9013    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9014    /* 0F EA = PMINSW -- 16x4 signed min */
   9015    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   9016       do_MMX_preamble();
   9017       delta = dis_MMXop_regmem_to_reg (
   9018                 sorb, delta+2, insn[1], "pminsw", False );
   9019       goto decode_success;
   9020    }
   9021 
   9022    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9023    /* 0F DA = PMINUB -- 8x8 unsigned min */
   9024    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   9025       do_MMX_preamble();
   9026       delta = dis_MMXop_regmem_to_reg (
   9027                 sorb, delta+2, insn[1], "pminub", False );
   9028       goto decode_success;
   9029    }
   9030 
   9031    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9032    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   9033       mmx(E), turn them into a byte, and put zero-extend of it in
   9034       ireg(G). */
   9035    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   9036       modrm = insn[2];
   9037       if (epartIsReg(modrm)) {
   9038          do_MMX_preamble();
   9039          t0 = newTemp(Ity_I64);
   9040          t1 = newTemp(Ity_I32);
   9041          assign(t0, getMMXReg(eregOfRM(modrm)));
   9042          assign(t1, mkIRExprCCall(
   9043                        Ity_I32, 0/*regparms*/,
   9044                        "x86g_calculate_mmx_pmovmskb",
   9045                        &x86g_calculate_mmx_pmovmskb,
   9046                        mkIRExprVec_1(mkexpr(t0))));
   9047          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   9048          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9049                                  nameIReg(4,gregOfRM(modrm)));
   9050          delta += 3;
   9051          goto decode_success;
   9052       }
   9053       /* else fall through */
   9054    }
   9055 
   9056    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9057    /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   9058    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   9059       do_MMX_preamble();
   9060       delta = dis_MMXop_regmem_to_reg (
   9061                 sorb, delta+2, insn[1], "pmuluh", False );
   9062       goto decode_success;
   9063    }
   9064 
   9065    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   9066    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   9067    /* 0F 18 /2 = PREFETCHT1 */
   9068    /* 0F 18 /3 = PREFETCHT2 */
   9069    if (insn[0] == 0x0F && insn[1] == 0x18
   9070        && !epartIsReg(insn[2])
   9071        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   9072       HChar* hintstr = "??";
   9073 
   9074       modrm = getIByte(delta+2);
   9075       vassert(!epartIsReg(modrm));
   9076 
   9077       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9078       delta += 2+alen;
   9079 
   9080       switch (gregOfRM(modrm)) {
   9081          case 0: hintstr = "nta"; break;
   9082          case 1: hintstr = "t0"; break;
   9083          case 2: hintstr = "t1"; break;
   9084          case 3: hintstr = "t2"; break;
   9085          default: vassert(0); /*NOTREACHED*/
   9086       }
   9087 
   9088       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9089       goto decode_success;
   9090    }
   9091 
   9092    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9093    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9094    if (insn[0] == 0x0F && insn[1] == 0x0D
   9095        && !epartIsReg(insn[2])
   9096        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9097       HChar* hintstr = "??";
   9098 
   9099       modrm = getIByte(delta+2);
   9100       vassert(!epartIsReg(modrm));
   9101 
   9102       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9103       delta += 2+alen;
   9104 
   9105       switch (gregOfRM(modrm)) {
   9106          case 0: hintstr = ""; break;
   9107          case 1: hintstr = "w"; break;
   9108          default: vassert(0); /*NOTREACHED*/
   9109       }
   9110 
   9111       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9112       goto decode_success;
   9113    }
   9114 
   9115    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9116    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9117    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9118       do_MMX_preamble();
   9119       delta = dis_MMXop_regmem_to_reg (
   9120                  sorb, delta+2, insn[1], "psadbw", False );
   9121       goto decode_success;
   9122    }
   9123 
   9124    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9125    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9126    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9127       Int order;
   9128       IRTemp sV, dV, s3, s2, s1, s0;
   9129       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9130       sV = newTemp(Ity_I64);
   9131       dV = newTemp(Ity_I64);
   9132       do_MMX_preamble();
   9133       modrm = insn[2];
   9134       if (epartIsReg(modrm)) {
   9135          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9136          order = (Int)insn[3];
   9137          delta += 2+2;
   9138          DIP("pshufw $%d,%s,%s\n", order,
   9139                                    nameMMXReg(eregOfRM(modrm)),
   9140                                    nameMMXReg(gregOfRM(modrm)));
   9141       } else {
   9142          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9143          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9144 	 order = (Int)insn[2+alen];
   9145          delta += 3+alen;
   9146          DIP("pshufw $%d,%s,%s\n", order,
   9147                                    dis_buf,
   9148                                    nameMMXReg(gregOfRM(modrm)));
   9149       }
   9150       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9151 
   9152 #     define SEL(n) \
   9153                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9154       assign(dV,
   9155 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9156                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9157       );
   9158       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9159 #     undef SEL
   9160       goto decode_success;
   9161    }
   9162 
   9163    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9164    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9165       vassert(sz == 4);
   9166       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9167                                         "rcpps", Iop_Recip32Fx4 );
   9168       goto decode_success;
   9169    }
   9170 
   9171    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9172    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9173       vassert(sz == 4);
   9174       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9175                                          "rcpss", Iop_Recip32F0x4 );
   9176       goto decode_success;
   9177    }
   9178 
   9179    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9180    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9181       vassert(sz == 4);
   9182       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9183                                         "rsqrtps", Iop_RSqrt32Fx4 );
   9184       goto decode_success;
   9185    }
   9186 
   9187    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9188    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9189       vassert(sz == 4);
   9190       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9191                                          "rsqrtss", Iop_RSqrt32F0x4 );
   9192       goto decode_success;
   9193    }
   9194 
   9195    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9196    if (insn[0] == 0x0F && insn[1] == 0xAE
   9197        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9198       vassert(sz == 4);
   9199       delta += 3;
   9200       /* Insert a memory fence.  It's sometimes important that these
   9201          are carried through to the generated code. */
   9202       stmt( IRStmt_MBE(Imbe_Fence) );
   9203       DIP("sfence\n");
   9204       goto decode_success;
   9205    }
   9206 
   9207    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9208    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9209       Int    select;
   9210       IRTemp sV, dV;
   9211       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9212       sV = newTemp(Ity_V128);
   9213       dV = newTemp(Ity_V128);
   9214       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9215       modrm = insn[2];
   9216       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9217 
   9218       if (epartIsReg(modrm)) {
   9219          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9220          select = (Int)insn[3];
   9221          delta += 2+2;
   9222          DIP("shufps $%d,%s,%s\n", select,
   9223                                    nameXMMReg(eregOfRM(modrm)),
   9224                                    nameXMMReg(gregOfRM(modrm)));
   9225       } else {
   9226          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9227          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9228          select = (Int)insn[2+alen];
   9229          delta += 3+alen;
   9230          DIP("shufps $%d,%s,%s\n", select,
   9231                                    dis_buf,
   9232                                    nameXMMReg(gregOfRM(modrm)));
   9233       }
   9234 
   9235       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9236       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9237 
   9238 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9239 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9240 
   9241       putXMMReg(
   9242          gregOfRM(modrm),
   9243          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9244                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9245       );
   9246 
   9247 #     undef SELD
   9248 #     undef SELS
   9249 
   9250       goto decode_success;
   9251    }
   9252 
   9253    /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9254    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9255       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9256                                         "sqrtps", Iop_Sqrt32Fx4 );
   9257       goto decode_success;
   9258    }
   9259 
   9260    /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9261    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9262       vassert(sz == 4);
   9263       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9264                                          "sqrtss", Iop_Sqrt32F0x4 );
   9265       goto decode_success;
   9266    }
   9267 
   9268    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9269    if (insn[0] == 0x0F && insn[1] == 0xAE
   9270        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9271       modrm = getIByte(delta+2);
   9272       vassert(sz == 4);
   9273       vassert(!epartIsReg(modrm));
   9274 
   9275       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9276       delta += 2+alen;
   9277 
   9278       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9279          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9280       */
   9281       /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9282       DIP("stmxcsr %s\n", dis_buf);
   9283       storeLE( mkexpr(addr),
   9284                mkIRExprCCall(
   9285                   Ity_I32, 0/*regp*/,
   9286                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9287                   mkIRExprVec_1( get_sse_roundingmode() )
   9288                )
   9289              );
   9290       goto decode_success;
   9291    }
   9292 
   9293    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9294    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9295       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9296       goto decode_success;
   9297    }
   9298 
   9299    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9300    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9301       vassert(sz == 4);
   9302       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9303       goto decode_success;
   9304    }
   9305 
   9306    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9307    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9308    /* These just appear to be special cases of SHUFPS */
   9309    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9310       IRTemp sV, dV;
   9311       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9312       Bool hi = toBool(insn[1] == 0x15);
   9313       sV = newTemp(Ity_V128);
   9314       dV = newTemp(Ity_V128);
   9315       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9316       modrm = insn[2];
   9317       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9318 
   9319       if (epartIsReg(modrm)) {
   9320          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9321          delta += 2+1;
   9322          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9323                                   nameXMMReg(eregOfRM(modrm)),
   9324                                   nameXMMReg(gregOfRM(modrm)));
   9325       } else {
   9326          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9327          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9328          delta += 2+alen;
   9329          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9330                                   dis_buf,
   9331                                   nameXMMReg(gregOfRM(modrm)));
   9332       }
   9333 
   9334       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9335       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9336 
   9337       if (hi) {
   9338          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9339       } else {
   9340          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9341       }
   9342 
   9343       goto decode_success;
   9344    }
   9345 
   9346    /* 0F 57 = XORPS -- G = G xor E */
   9347    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9348       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9349       goto decode_success;
   9350    }
   9351 
   9352    /* ---------------------------------------------------- */
   9353    /* --- end of the SSE decoder.                      --- */
   9354    /* ---------------------------------------------------- */
   9355 
   9356    /* ---------------------------------------------------- */
   9357    /* --- start of the SSE2 decoder.                   --- */
   9358    /* ---------------------------------------------------- */
   9359 
   9360    /* Skip parts of the decoder which don't apply given the stated
   9361       guest subarchitecture. */
   9362    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9363       goto after_sse_decoders; /* no SSE2 capabilities */
   9364 
   9365    insn = (UChar*)&guest_code[delta];
   9366 
   9367    /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9368    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9369       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9370       goto decode_success;
   9371    }
   9372 
   9373    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9374    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9375       vassert(sz == 4);
   9376       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9377       goto decode_success;
   9378    }
   9379 
   9380    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9381    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9382       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9383       goto decode_success;
   9384    }
   9385 
   9386    /* 66 0F 54 = ANDPD -- G = G and E */
   9387    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9388       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9389       goto decode_success;
   9390    }
   9391 
   9392    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9393    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9394       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9395       goto decode_success;
   9396    }
   9397 
   9398    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9399    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9400       vassert(sz == 4);
   9401       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9402       goto decode_success;
   9403    }
   9404 
   9405    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9406    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9407    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9408       IRTemp argL = newTemp(Ity_F64);
   9409       IRTemp argR = newTemp(Ity_F64);
   9410       modrm = getIByte(delta+2);
   9411       if (epartIsReg(modrm)) {
   9412          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9413          delta += 2+1;
   9414          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9415                                   nameXMMReg(gregOfRM(modrm)) );
   9416       } else {
   9417          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9418 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9419          delta += 2+alen;
   9420          DIP("[u]comisd %s,%s\n", dis_buf,
   9421                                   nameXMMReg(gregOfRM(modrm)) );
   9422       }
   9423       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9424 
   9425       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9426       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9427       stmt( IRStmt_Put(
   9428                OFFB_CC_DEP1,
   9429                binop( Iop_And32,
   9430                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9431                       mkU32(0x45)
   9432           )));
   9433       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9434          elimination of previous stores to this field work better. */
   9435       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9436       goto decode_success;
   9437    }
   9438 
   9439    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9440       F64 in xmm(G) */
   9441    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9442       IRTemp arg64 = newTemp(Ity_I64);
   9443       vassert(sz == 4);
   9444 
   9445       modrm = getIByte(delta+3);
   9446       if (epartIsReg(modrm)) {
   9447          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9448          delta += 3+1;
   9449          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9450                                  nameXMMReg(gregOfRM(modrm)));
   9451       } else {
   9452          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9453 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9454          delta += 3+alen;
   9455          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9456                                  nameXMMReg(gregOfRM(modrm)) );
   9457       }
   9458 
   9459       putXMMRegLane64F(
   9460          gregOfRM(modrm), 0,
   9461          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9462       );
   9463 
   9464       putXMMRegLane64F(
   9465          gregOfRM(modrm), 1,
   9466          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9467       );
   9468 
   9469       goto decode_success;
   9470    }
   9471 
   9472    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9473       xmm(G) */
   9474    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9475       IRTemp argV  = newTemp(Ity_V128);
   9476       IRTemp rmode = newTemp(Ity_I32);
   9477 
   9478       modrm = getIByte(delta+2);
   9479       if (epartIsReg(modrm)) {
   9480          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9481          delta += 2+1;
   9482          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9483                                  nameXMMReg(gregOfRM(modrm)));
   9484       } else {
   9485          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9486 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9487          delta += 2+alen;
   9488          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9489                                  nameXMMReg(gregOfRM(modrm)) );
   9490       }
   9491 
   9492       assign( rmode, get_sse_roundingmode() );
   9493       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9494 
   9495 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9496                              mkexpr(rmode),                   \
   9497                              unop(Iop_I32StoF64,mkexpr(_t)))
   9498 
   9499       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9500       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9501       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9502       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9503 
   9504 #     undef CVT
   9505 
   9506       goto decode_success;
   9507    }
   9508 
   9509    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9510       lo half xmm(G), and zero upper half */
   9511    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9512       IRTemp argV  = newTemp(Ity_V128);
   9513       IRTemp rmode = newTemp(Ity_I32);
   9514       vassert(sz == 4);
   9515 
   9516       modrm = getIByte(delta+3);
   9517       if (epartIsReg(modrm)) {
   9518          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9519          delta += 3+1;
   9520          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9521                                  nameXMMReg(gregOfRM(modrm)));
   9522       } else {
   9523          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9524 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9525          delta += 3+alen;
   9526          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9527                                  nameXMMReg(gregOfRM(modrm)) );
   9528       }
   9529 
   9530       assign( rmode, get_sse_roundingmode() );
   9531       t0 = newTemp(Ity_F64);
   9532       t1 = newTemp(Ity_F64);
   9533       assign( t0, unop(Iop_ReinterpI64asF64,
   9534                        unop(Iop_V128to64, mkexpr(argV))) );
   9535       assign( t1, unop(Iop_ReinterpI64asF64,
   9536                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9537 
   9538 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9539                              mkexpr(rmode),                   \
   9540                              mkexpr(_t) )
   9541 
   9542       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9543       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9544       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9545       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9546 
   9547 #     undef CVT
   9548 
   9549       goto decode_success;
   9550    }
   9551 
   9552    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9553       I32 in mmx, according to prevailing SSE rounding mode */
   9554    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9555       I32 in mmx, rounding towards zero */
   9556    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9557       IRTemp dst64  = newTemp(Ity_I64);
   9558       IRTemp rmode  = newTemp(Ity_I32);
   9559       IRTemp f64lo  = newTemp(Ity_F64);
   9560       IRTemp f64hi  = newTemp(Ity_F64);
   9561       Bool   r2zero = toBool(insn[1] == 0x2C);
   9562 
   9563       do_MMX_preamble();
   9564       modrm = getIByte(delta+2);
   9565 
   9566       if (epartIsReg(modrm)) {
   9567          delta += 2+1;
   9568 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9569 	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9570          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9571                                    nameXMMReg(eregOfRM(modrm)),
   9572                                    nameMMXReg(gregOfRM(modrm)));
   9573       } else {
   9574          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9575 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9576 	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9577                                               mkexpr(addr),
   9578                                               mkU32(8) )));
   9579          delta += 2+alen;
   9580          DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   9581                                    dis_buf,
   9582                                    nameMMXReg(gregOfRM(modrm)));
   9583       }
   9584 
   9585       if (r2zero) {
   9586          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9587       } else {
   9588          assign( rmode, get_sse_roundingmode() );
   9589       }
   9590 
   9591       assign(
   9592          dst64,
   9593          binop( Iop_32HLto64,
   9594                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9595                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9596               )
   9597       );
   9598 
   9599       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9600       goto decode_success;
   9601    }
   9602 
   9603    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9604       lo half xmm(G), and zero upper half */
   9605    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9606       been nicer to merge them together, but the insn[] offsets differ
   9607       by one. */
   9608    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9609       IRTemp argV  = newTemp(Ity_V128);
   9610       IRTemp rmode = newTemp(Ity_I32);
   9611 
   9612       modrm = getIByte(delta+2);
   9613       if (epartIsReg(modrm)) {
   9614          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9615          delta += 2+1;
   9616          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9617                                  nameXMMReg(gregOfRM(modrm)));
   9618       } else {
   9619          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9620 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9621          delta += 2+alen;
   9622          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9623                                  nameXMMReg(gregOfRM(modrm)) );
   9624       }
   9625 
   9626       assign( rmode, get_sse_roundingmode() );
   9627       t0 = newTemp(Ity_F64);
   9628       t1 = newTemp(Ity_F64);
   9629       assign( t0, unop(Iop_ReinterpI64asF64,
   9630                        unop(Iop_V128to64, mkexpr(argV))) );
   9631       assign( t1, unop(Iop_ReinterpI64asF64,
   9632                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9633 
   9634 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9635                              mkexpr(rmode),                   \
   9636                              mkexpr(_t) )
   9637 
   9638       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9639       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9640       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9641       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9642 
   9643 #     undef CVT
   9644 
   9645       goto decode_success;
   9646    }
   9647 
   9648    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9649       xmm(G) */
   9650    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9651       IRTemp arg64 = newTemp(Ity_I64);
   9652 
   9653       modrm = getIByte(delta+2);
   9654       if (epartIsReg(modrm)) {
   9655          /* Only switch to MMX mode if the source is a MMX register.
   9656             This is inconsistent with all other instructions which
   9657             convert between XMM and (M64 or MMX), which always switch
   9658             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9659             least, that's what the Intel docs seem to me to say.
   9660             Fixes #210264. */
   9661          do_MMX_preamble();
   9662          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9663          delta += 2+1;
   9664          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9665                                  nameXMMReg(gregOfRM(modrm)));
   9666       } else {
   9667          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9668 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9669          delta += 2+alen;
   9670          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9671                                  nameXMMReg(gregOfRM(modrm)) );
   9672       }
   9673 
   9674       putXMMRegLane64F(
   9675          gregOfRM(modrm), 0,
   9676          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9677       );
   9678 
   9679       putXMMRegLane64F(
   9680          gregOfRM(modrm), 1,
   9681          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9682       );
   9683 
   9684       goto decode_success;
   9685    }
   9686 
   9687    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9688       xmm(G) */
   9689    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9690       IRTemp argV  = newTemp(Ity_V128);
   9691       IRTemp rmode = newTemp(Ity_I32);
   9692 
   9693       modrm = getIByte(delta+2);
   9694       if (epartIsReg(modrm)) {
   9695          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9696          delta += 2+1;
   9697          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9698                                  nameXMMReg(gregOfRM(modrm)));
   9699       } else {
   9700          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9701 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9702          delta += 2+alen;
   9703          DIP("cvtps2dq %s,%s\n", dis_buf,
   9704                                  nameXMMReg(gregOfRM(modrm)) );
   9705       }
   9706 
   9707       assign( rmode, get_sse_roundingmode() );
   9708       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9709 
   9710       /* This is less than ideal.  If it turns out to be a performance
   9711 	 bottleneck it can be improved. */
   9712 #     define CVT(_t)                            \
   9713         binop( Iop_F64toI32S,                   \
   9714                mkexpr(rmode),                   \
   9715                unop( Iop_F32toF64,              \
   9716                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9717 
   9718       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9719       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9720       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9721       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9722 
   9723 #     undef CVT
   9724 
   9725       goto decode_success;
   9726    }
   9727 
   9728    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9729       F64 in xmm(G). */
   9730    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9731       IRTemp f32lo = newTemp(Ity_F32);
   9732       IRTemp f32hi = newTemp(Ity_F32);
   9733 
   9734       modrm = getIByte(delta+2);
   9735       if (epartIsReg(modrm)) {
   9736          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9737          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9738          delta += 2+1;
   9739          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9740                                  nameXMMReg(gregOfRM(modrm)));
   9741       } else {
   9742          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9743 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9744 	 assign( f32hi, loadLE(Ity_F32,
   9745                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9746          delta += 2+alen;
   9747          DIP("cvtps2pd %s,%s\n", dis_buf,
   9748                                  nameXMMReg(gregOfRM(modrm)) );
   9749       }
   9750 
   9751       putXMMRegLane64F( gregOfRM(modrm), 1,
   9752                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9753       putXMMRegLane64F( gregOfRM(modrm), 0,
   9754                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9755 
   9756       goto decode_success;
   9757    }
   9758 
   9759    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9760       I32 in ireg, according to prevailing SSE rounding mode */
   9761    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9762       I32 in ireg, rounding towards zero */
   9763    if (insn[0] == 0xF2 && insn[1] == 0x0F
   9764        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   9765       IRTemp rmode = newTemp(Ity_I32);
   9766       IRTemp f64lo = newTemp(Ity_F64);
   9767       Bool   r2zero = toBool(insn[2] == 0x2C);
   9768       vassert(sz == 4);
   9769 
   9770       modrm = getIByte(delta+3);
   9771       if (epartIsReg(modrm)) {
   9772          delta += 3+1;
   9773 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9774          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9775                                    nameXMMReg(eregOfRM(modrm)),
   9776                                    nameIReg(4, gregOfRM(modrm)));
   9777       } else {
   9778          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9779 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9780          delta += 3+alen;
   9781          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9782                                    dis_buf,
   9783                                    nameIReg(4, gregOfRM(modrm)));
   9784       }
   9785 
   9786       if (r2zero) {
   9787          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9788       } else {
   9789          assign( rmode, get_sse_roundingmode() );
   9790       }
   9791 
   9792       putIReg(4, gregOfRM(modrm),
   9793                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   9794 
   9795       goto decode_success;
   9796    }
   9797 
   9798    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   9799       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   9800    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9801       IRTemp rmode = newTemp(Ity_I32);
   9802       IRTemp f64lo = newTemp(Ity_F64);
   9803       vassert(sz == 4);
   9804 
   9805       modrm = getIByte(delta+3);
   9806       if (epartIsReg(modrm)) {
   9807          delta += 3+1;
   9808 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9809          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9810                                  nameXMMReg(gregOfRM(modrm)));
   9811       } else {
   9812          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9813 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9814          delta += 3+alen;
   9815          DIP("cvtsd2ss %s,%s\n", dis_buf,
   9816                                  nameXMMReg(gregOfRM(modrm)));
   9817       }
   9818 
   9819       assign( rmode, get_sse_roundingmode() );
   9820       putXMMRegLane32F(
   9821          gregOfRM(modrm), 0,
   9822          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   9823       );
   9824 
   9825       goto decode_success;
   9826    }
   9827 
   9828    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   9829       half xmm */
   9830    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   9831       IRTemp arg32 = newTemp(Ity_I32);
   9832       vassert(sz == 4);
   9833 
   9834       modrm = getIByte(delta+3);
   9835       if (epartIsReg(modrm)) {
   9836          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   9837          delta += 3+1;
   9838          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   9839                                  nameXMMReg(gregOfRM(modrm)));
   9840       } else {
   9841          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9842 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   9843          delta += 3+alen;
   9844          DIP("cvtsi2sd %s,%s\n", dis_buf,
   9845                                  nameXMMReg(gregOfRM(modrm)) );
   9846       }
   9847 
   9848       putXMMRegLane64F(
   9849          gregOfRM(modrm), 0,
   9850          unop(Iop_I32StoF64, mkexpr(arg32)) );
   9851 
   9852       goto decode_success;
   9853    }
   9854 
   9855    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   9856       low half xmm(G) */
   9857    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9858       IRTemp f32lo = newTemp(Ity_F32);
   9859       vassert(sz == 4);
   9860 
   9861       modrm = getIByte(delta+3);
   9862       if (epartIsReg(modrm)) {
   9863          delta += 3+1;
   9864 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   9865          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9866                                  nameXMMReg(gregOfRM(modrm)));
   9867       } else {
   9868          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9869 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   9870          delta += 3+alen;
   9871          DIP("cvtss2sd %s,%s\n", dis_buf,
   9872                                  nameXMMReg(gregOfRM(modrm)));
   9873       }
   9874 
   9875       putXMMRegLane64F( gregOfRM(modrm), 0,
   9876                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   9877 
   9878       goto decode_success;
   9879    }
   9880 
   9881    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9882       lo half xmm(G), and zero upper half, rounding towards zero */
   9883    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   9884       IRTemp argV  = newTemp(Ity_V128);
   9885       IRTemp rmode = newTemp(Ity_I32);
   9886 
   9887       modrm = getIByte(delta+2);
   9888       if (epartIsReg(modrm)) {
   9889          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9890          delta += 2+1;
   9891          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9892                                   nameXMMReg(gregOfRM(modrm)));
   9893       } else {
   9894          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9895 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9896          delta += 2+alen;
   9897          DIP("cvttpd2dq %s,%s\n", dis_buf,
   9898                                   nameXMMReg(gregOfRM(modrm)) );
   9899       }
   9900 
   9901       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9902 
   9903       t0 = newTemp(Ity_F64);
   9904       t1 = newTemp(Ity_F64);
   9905       assign( t0, unop(Iop_ReinterpI64asF64,
   9906                        unop(Iop_V128to64, mkexpr(argV))) );
   9907       assign( t1, unop(Iop_ReinterpI64asF64,
   9908                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9909 
   9910 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9911                              mkexpr(rmode),                   \
   9912                              mkexpr(_t) )
   9913 
   9914       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9915       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9916       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9917       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9918 
   9919 #     undef CVT
   9920 
   9921       goto decode_success;
   9922    }
   9923 
   9924    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9925       xmm(G), rounding towards zero */
   9926    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   9927       IRTemp argV  = newTemp(Ity_V128);
   9928       IRTemp rmode = newTemp(Ity_I32);
   9929       vassert(sz == 4);
   9930 
   9931       modrm = getIByte(delta+3);
   9932       if (epartIsReg(modrm)) {
   9933          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9934          delta += 3+1;
   9935          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9936                                   nameXMMReg(gregOfRM(modrm)));
   9937       } else {
   9938          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9939 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9940          delta += 3+alen;
   9941          DIP("cvttps2dq %s,%s\n", dis_buf,
   9942                                   nameXMMReg(gregOfRM(modrm)) );
   9943       }
   9944 
   9945       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9946       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9947 
   9948       /* This is less than ideal.  If it turns out to be a performance
   9949 	 bottleneck it can be improved. */
   9950 #     define CVT(_t)                            \
   9951         binop( Iop_F64toI32S,                   \
   9952                mkexpr(rmode),                   \
   9953                unop( Iop_F32toF64,              \
   9954                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9955 
   9956       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9957       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9958       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9959       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9960 
   9961 #     undef CVT
   9962 
   9963       goto decode_success;
   9964    }
   9965 
   9966    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   9967    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   9968       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   9969       goto decode_success;
   9970    }
   9971 
   9972    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   9973    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   9974       vassert(sz == 4);
   9975       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   9976       goto decode_success;
   9977    }
   9978 
   9979    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   9980    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   9981    if (insn[0] == 0x0F && insn[1] == 0xAE
   9982        && epartIsReg(insn[2])
   9983        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   9984       vassert(sz == 4);
   9985       delta += 3;
   9986       /* Insert a memory fence.  It's sometimes important that these
   9987          are carried through to the generated code. */
   9988       stmt( IRStmt_MBE(Imbe_Fence) );
   9989       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   9990       goto decode_success;
   9991    }
   9992 
   9993    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   9994    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9995       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   9996       goto decode_success;
   9997    }
   9998 
   9999    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   10000    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   10001       vassert(sz == 4);
   10002       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   10003       goto decode_success;
   10004    }
   10005 
   10006    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   10007    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   10008       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   10009       goto decode_success;
   10010    }
   10011 
   10012    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   10013    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   10014       vassert(sz == 4);
   10015       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   10016       goto decode_success;
   10017    }
   10018 
   10019    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   10020    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   10021    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   10022    if (sz == 2 && insn[0] == 0x0F
   10023        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   10024       HChar* wot = insn[1]==0x28 ? "apd" :
   10025                    insn[1]==0x10 ? "upd" : "dqa";
   10026       modrm = getIByte(delta+2);
   10027       if (epartIsReg(modrm)) {
   10028          putXMMReg( gregOfRM(modrm),
   10029                     getXMMReg( eregOfRM(modrm) ));
   10030          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   10031                                    nameXMMReg(gregOfRM(modrm)));
   10032          delta += 2+1;
   10033       } else {
   10034          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10035          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   10036             gen_SEGV_if_not_16_aligned( addr );
   10037          putXMMReg( gregOfRM(modrm),
   10038                     loadLE(Ity_V128, mkexpr(addr)) );
   10039          DIP("mov%s %s,%s\n", wot, dis_buf,
   10040                                    nameXMMReg(gregOfRM(modrm)));
   10041          delta += 2+alen;
   10042       }
   10043       goto decode_success;
   10044    }
   10045 
   10046    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   10047    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   10048    if (sz == 2 && insn[0] == 0x0F
   10049        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   10050       HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   10051       modrm = getIByte(delta+2);
   10052       if (epartIsReg(modrm)) {
   10053          /* fall through; awaiting test case */
   10054       } else {
   10055          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10056          if (insn[1] == 0x29/*movapd*/)
   10057             gen_SEGV_if_not_16_aligned( addr );
   10058          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10059          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   10060                                    dis_buf );
   10061          delta += 2+alen;
   10062          goto decode_success;
   10063       }
   10064    }
   10065 
   10066    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   10067    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10068       modrm = getIByte(delta+2);
   10069       if (epartIsReg(modrm)) {
   10070          delta += 2+1;
   10071          putXMMReg(
   10072             gregOfRM(modrm),
   10073             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10074          );
   10075          DIP("movd %s, %s\n",
   10076              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10077       } else {
   10078          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10079          delta += 2+alen;
   10080          putXMMReg(
   10081             gregOfRM(modrm),
   10082             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10083          );
   10084          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10085       }
   10086       goto decode_success;
   10087    }
   10088 
   10089    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   10090    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10091       modrm = getIByte(delta+2);
   10092       if (epartIsReg(modrm)) {
   10093          delta += 2+1;
   10094          putIReg( 4, eregOfRM(modrm),
   10095                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10096          DIP("movd %s, %s\n",
   10097              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10098       } else {
   10099          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10100          delta += 2+alen;
   10101          storeLE( mkexpr(addr),
   10102                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10103          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10104       }
   10105       goto decode_success;
   10106    }
   10107 
   10108    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   10109    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10110       modrm = getIByte(delta+2);
   10111       if (epartIsReg(modrm)) {
   10112          delta += 2+1;
   10113          putXMMReg( eregOfRM(modrm),
   10114                     getXMMReg(gregOfRM(modrm)) );
   10115          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10116                                 nameXMMReg(eregOfRM(modrm)));
   10117       } else {
   10118          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10119          delta += 2+alen;
   10120          gen_SEGV_if_not_16_aligned( addr );
   10121          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10122          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10123       }
   10124       goto decode_success;
   10125    }
   10126 
   10127    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10128    /* Unfortunately can't simply use the MOVDQA case since the
   10129       prefix lengths are different (66 vs F3) */
   10130    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10131       vassert(sz == 4);
   10132       modrm = getIByte(delta+3);
   10133       if (epartIsReg(modrm)) {
   10134          putXMMReg( gregOfRM(modrm),
   10135                     getXMMReg( eregOfRM(modrm) ));
   10136          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10137                                nameXMMReg(gregOfRM(modrm)));
   10138          delta += 3+1;
   10139       } else {
   10140          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10141          putXMMReg( gregOfRM(modrm),
   10142                     loadLE(Ity_V128, mkexpr(addr)) );
   10143          DIP("movdqu %s,%s\n", dis_buf,
   10144                                nameXMMReg(gregOfRM(modrm)));
   10145          delta += 3+alen;
   10146       }
   10147       goto decode_success;
   10148    }
   10149 
   10150    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10151    /* Unfortunately can't simply use the MOVDQA case since the
   10152       prefix lengths are different (66 vs F3) */
   10153    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10154       vassert(sz == 4);
   10155       modrm = getIByte(delta+3);
   10156       if (epartIsReg(modrm)) {
   10157          delta += 3+1;
   10158          putXMMReg( eregOfRM(modrm),
   10159                     getXMMReg(gregOfRM(modrm)) );
   10160          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10161                                 nameXMMReg(eregOfRM(modrm)));
   10162       } else {
   10163          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10164          delta += 3+alen;
   10165          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10166          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10167       }
   10168       goto decode_success;
   10169    }
   10170 
   10171    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   10172    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10173       vassert(sz == 4);
   10174       modrm = getIByte(delta+3);
   10175       if (epartIsReg(modrm)) {
   10176          do_MMX_preamble();
   10177          putMMXReg( gregOfRM(modrm),
   10178                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10179          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10180                                 nameMMXReg(gregOfRM(modrm)));
   10181          delta += 3+1;
   10182          goto decode_success;
   10183       } else {
   10184          /* fall through, apparently no mem case for this insn */
   10185       }
   10186    }
   10187 
   10188    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10189    /* This seems identical to MOVHPS.  This instruction encoding is
   10190       completely crazy. */
   10191    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10192       modrm = getIByte(delta+2);
   10193       if (epartIsReg(modrm)) {
   10194          /* fall through; apparently reg-reg is not possible */
   10195       } else {
   10196          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10197          delta += 2+alen;
   10198          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10199                           loadLE(Ity_I64, mkexpr(addr)) );
   10200          DIP("movhpd %s,%s\n", dis_buf,
   10201                                nameXMMReg( gregOfRM(modrm) ));
   10202          goto decode_success;
   10203       }
   10204    }
   10205 
   10206    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10207    /* Again, this seems identical to MOVHPS. */
   10208    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10209       if (!epartIsReg(insn[2])) {
   10210          delta += 2;
   10211          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10212          delta += alen;
   10213          storeLE( mkexpr(addr),
   10214                   getXMMRegLane64( gregOfRM(insn[2]),
   10215                                    1/*upper lane*/ ) );
   10216          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10217                                dis_buf);
   10218          goto decode_success;
   10219       }
   10220       /* else fall through */
   10221    }
   10222 
   10223    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10224    /* Identical to MOVLPS ? */
   10225    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10226       modrm = getIByte(delta+2);
   10227       if (epartIsReg(modrm)) {
   10228          /* fall through; apparently reg-reg is not possible */
   10229       } else {
   10230          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10231          delta += 2+alen;
   10232          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10233                           loadLE(Ity_I64, mkexpr(addr)) );
   10234          DIP("movlpd %s, %s\n",
   10235              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10236          goto decode_success;
   10237       }
   10238    }
   10239 
   10240    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10241    /* Identical to MOVLPS ? */
   10242    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10243       if (!epartIsReg(insn[2])) {
   10244          delta += 2;
   10245          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10246          delta += alen;
   10247          storeLE( mkexpr(addr),
   10248                   getXMMRegLane64( gregOfRM(insn[2]),
   10249                                    0/*lower lane*/ ) );
   10250          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10251                                 dis_buf);
   10252          goto decode_success;
   10253       }
   10254       /* else fall through */
   10255    }
   10256 
   10257    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10258       2 lowest bits of ireg(G) */
   10259    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10260       modrm = getIByte(delta+2);
   10261       if (sz == 2 && epartIsReg(modrm)) {
   10262          Int src;
   10263          t0 = newTemp(Ity_I32);
   10264          t1 = newTemp(Ity_I32);
   10265          delta += 2+1;
   10266          src = eregOfRM(modrm);
   10267          assign( t0, binop( Iop_And32,
   10268                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10269                             mkU32(1) ));
   10270          assign( t1, binop( Iop_And32,
   10271                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10272                             mkU32(2) ));
   10273          putIReg(4, gregOfRM(modrm),
   10274                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10275                  );
   10276          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10277                                  nameIReg(4, gregOfRM(modrm)));
   10278          goto decode_success;
   10279       }
   10280       /* else fall through */
   10281    }
   10282 
   10283    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   10284    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10285       modrm = getIByte(delta+2);
   10286       if (sz == 2 && epartIsReg(modrm)) {
   10287          IRTemp regD    = newTemp(Ity_V128);
   10288          IRTemp mask    = newTemp(Ity_V128);
   10289          IRTemp olddata = newTemp(Ity_V128);
   10290          IRTemp newdata = newTemp(Ity_V128);
   10291                 addr    = newTemp(Ity_I32);
   10292 
   10293          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10294          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10295 
   10296          /* Unfortunately can't do the obvious thing with SarN8x16
   10297             here since that can't be re-emitted as SSE2 code - no such
   10298             insn. */
   10299 	 assign(
   10300             mask,
   10301             binop(Iop_64HLtoV128,
   10302                   binop(Iop_SarN8x8,
   10303                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10304                         mkU8(7) ),
   10305                   binop(Iop_SarN8x8,
   10306                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10307                         mkU8(7) ) ));
   10308          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10309          assign( newdata,
   10310                  binop(Iop_OrV128,
   10311                        binop(Iop_AndV128,
   10312                              mkexpr(regD),
   10313                              mkexpr(mask) ),
   10314                        binop(Iop_AndV128,
   10315                              mkexpr(olddata),
   10316                              unop(Iop_NotV128, mkexpr(mask)))) );
   10317          storeLE( mkexpr(addr), mkexpr(newdata) );
   10318 
   10319          delta += 2+1;
   10320          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10321                                    nameXMMReg( gregOfRM(modrm) ) );
   10322          goto decode_success;
   10323       }
   10324       /* else fall through */
   10325    }
   10326 
   10327    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   10328    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10329       modrm = getIByte(delta+2);
   10330       if (sz == 2 && !epartIsReg(modrm)) {
   10331          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10332          gen_SEGV_if_not_16_aligned( addr );
   10333          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10334          DIP("movntdq %s,%s\n", dis_buf,
   10335                                 nameXMMReg(gregOfRM(modrm)));
   10336          delta += 2+alen;
   10337          goto decode_success;
   10338       }
   10339       /* else fall through */
   10340    }
   10341 
   10342    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   10343    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10344       vassert(sz == 4);
   10345       modrm = getIByte(delta+2);
   10346       if (!epartIsReg(modrm)) {
   10347          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10348          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
   10349          DIP("movnti %s,%s\n", dis_buf,
   10350                                nameIReg(4, gregOfRM(modrm)));
   10351          delta += 2+alen;
   10352          goto decode_success;
   10353       }
   10354       /* else fall through */
   10355    }
   10356 
   10357    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10358       or lo half xmm).  */
   10359    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10360       modrm = getIByte(delta+2);
   10361       if (epartIsReg(modrm)) {
   10362          /* fall through, awaiting test case */
   10363          /* dst: lo half copied, hi half zeroed */
   10364       } else {
   10365          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10366          storeLE( mkexpr(addr),
   10367                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10368          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10369          delta += 2+alen;
   10370          goto decode_success;
   10371       }
   10372    }
   10373 
   10374    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10375       hi half). */
   10376    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10377       vassert(sz == 4);
   10378       modrm = getIByte(delta+3);
   10379       if (epartIsReg(modrm)) {
   10380          do_MMX_preamble();
   10381          putXMMReg( gregOfRM(modrm),
   10382                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10383          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10384                                 nameXMMReg(gregOfRM(modrm)));
   10385          delta += 3+1;
   10386          goto decode_success;
   10387       } else {
   10388          /* fall through, apparently no mem case for this insn */
   10389       }
   10390    }
   10391 
   10392    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10393       G (lo half xmm).  Upper half of G is zeroed out. */
   10394    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10395       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10396       If E is reg, upper half of G is unchanged. */
   10397    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10398        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
   10399       vassert(sz == 4);
   10400       modrm = getIByte(delta+3);
   10401       if (epartIsReg(modrm)) {
   10402          putXMMRegLane64( gregOfRM(modrm), 0,
   10403                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10404          if (insn[0] == 0xF3/*MOVQ*/) {
   10405             /* zero bits 127:64 */
   10406             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10407          }
   10408          DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10409                               nameXMMReg(gregOfRM(modrm)));
   10410          delta += 3+1;
   10411       } else {
   10412          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10413          /* zero bits 127:64 */
   10414          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10415          /* write bits 63:0 */
   10416          putXMMRegLane64( gregOfRM(modrm), 0,
   10417                           loadLE(Ity_I64, mkexpr(addr)) );
   10418          DIP("movsd %s,%s\n", dis_buf,
   10419                               nameXMMReg(gregOfRM(modrm)));
   10420          delta += 3+alen;
   10421       }
   10422       goto decode_success;
   10423    }
   10424 
   10425    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10426       or lo half xmm). */
   10427    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10428       vassert(sz == 4);
   10429       modrm = getIByte(delta+3);
   10430       if (epartIsReg(modrm)) {
   10431          putXMMRegLane64( eregOfRM(modrm), 0,
   10432                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10433          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10434                               nameXMMReg(eregOfRM(modrm)));
   10435          delta += 3+1;
   10436       } else {
   10437          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10438          storeLE( mkexpr(addr),
   10439                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10440          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10441                               dis_buf);
   10442          delta += 3+alen;
   10443       }
   10444       goto decode_success;
   10445    }
   10446 
   10447    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10448    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10449       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10450       goto decode_success;
   10451    }
   10452 
   10453    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10454    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10455       vassert(sz == 4);
   10456       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10457       goto decode_success;
   10458    }
   10459 
   10460    /* 66 0F 56 = ORPD -- G = G or E */
   10461    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10462       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10463       goto decode_success;
   10464    }
   10465 
   10466    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10467    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10468       Int    select;
   10469       IRTemp sV = newTemp(Ity_V128);
   10470       IRTemp dV = newTemp(Ity_V128);
   10471       IRTemp s1 = newTemp(Ity_I64);
   10472       IRTemp s0 = newTemp(Ity_I64);
   10473       IRTemp d1 = newTemp(Ity_I64);
   10474       IRTemp d0 = newTemp(Ity_I64);
   10475 
   10476       modrm = insn[2];
   10477       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10478 
   10479       if (epartIsReg(modrm)) {
   10480          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10481          select = (Int)insn[3];
   10482          delta += 2+2;
   10483          DIP("shufpd $%d,%s,%s\n", select,
   10484                                    nameXMMReg(eregOfRM(modrm)),
   10485                                    nameXMMReg(gregOfRM(modrm)));
   10486       } else {
   10487          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10488          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10489          select = (Int)insn[2+alen];
   10490          delta += 3+alen;
   10491          DIP("shufpd $%d,%s,%s\n", select,
   10492                                    dis_buf,
   10493                                    nameXMMReg(gregOfRM(modrm)));
   10494       }
   10495 
   10496       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10497       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10498       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10499       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10500 
   10501 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10502 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10503 
   10504       putXMMReg(
   10505          gregOfRM(modrm),
   10506          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10507       );
   10508 
   10509 #     undef SELD
   10510 #     undef SELS
   10511 
   10512       goto decode_success;
   10513    }
   10514 
   10515    /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   10516    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10517       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10518                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10519       goto decode_success;
   10520    }
   10521 
   10522    /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   10523    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10524       vassert(sz == 4);
   10525       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10526                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10527       goto decode_success;
   10528    }
   10529 
   10530    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10531    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10532       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10533       goto decode_success;
   10534    }
   10535 
   10536    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10537    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10538       vassert(sz == 4);
   10539       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10540       goto decode_success;
   10541    }
   10542 
   10543    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10544    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10545    /* These just appear to be special cases of SHUFPD */
   10546    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10547       IRTemp s1 = newTemp(Ity_I64);
   10548       IRTemp s0 = newTemp(Ity_I64);
   10549       IRTemp d1 = newTemp(Ity_I64);
   10550       IRTemp d0 = newTemp(Ity_I64);
   10551       IRTemp sV = newTemp(Ity_V128);
   10552       IRTemp dV = newTemp(Ity_V128);
   10553       Bool   hi = toBool(insn[1] == 0x15);  /* 0x15 = UNPCKHPD, 0x14 = UNPCKLPD */
   10554       /* dV holds the G (destination) register; sV holds the E operand. */
   10555       modrm = insn[2];
   10556       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10557       /* E is either an xmm register or a 128-bit value loaded from memory. */
   10558       if (epartIsReg(modrm)) {
   10559          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10560          delta += 2+1;
   10561          DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
   10562                                   nameXMMReg(eregOfRM(modrm)),
   10563                                   nameXMMReg(gregOfRM(modrm)));
   10564       } else {
   10565          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10566          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10567          delta += 2+alen;
   10568          DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
   10569                                   dis_buf,
   10570                                   nameXMMReg(gregOfRM(modrm)));
   10571       }
   10572       /* Split both operands into their upper (x1) and lower (x0) 64-bit halves. */
   10573       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10574       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10575       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10576       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10577       /* G = (source half : destination half), taking the upper or lower pair. */
   10578       if (hi) {
   10579          putXMMReg( gregOfRM(modrm),
   10580                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10581       } else {
   10582          putXMMReg( gregOfRM(modrm),
   10583                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10584       }
   10585 
   10586       goto decode_success;
   10587    }
   10588 
   10589    /* 66 0F 57 = XORPD -- G = G xor E */
   10590    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10591       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10592       goto decode_success;
   10593    }
   10594 
   10595    /* 66 0F 6B = PACKSSDW */
   10596    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10597       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10598                                  "packssdw",
   10599                                  Iop_QNarrowBin32Sto16Sx8, True );
   10600       goto decode_success;
   10601    }
   10602 
   10603    /* 66 0F 63 = PACKSSWB */
   10604    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10605       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10606                                  "packsswb",
   10607                                  Iop_QNarrowBin16Sto8Sx16, True );
   10608       goto decode_success;
   10609    }
   10610 
   10611    /* 66 0F 67 = PACKUSWB */
   10612    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10613       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10614                                  "packuswb",
   10615                                  Iop_QNarrowBin16Sto8Ux16, True );
   10616       goto decode_success;
   10617    }
   10618 
   10619    /* 66 0F FC = PADDB */
   10620    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10621       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10622                                  "paddb", Iop_Add8x16, False );
   10623       goto decode_success;
   10624    }
   10625 
   10626    /* 66 0F FE = PADDD */
   10627    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10628       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10629                                  "paddd", Iop_Add32x4, False );
   10630       goto decode_success;
   10631    }
   10632 
   10633    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10634    /* 0F D4 = PADDQ -- add 64x1 */
   10635    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10636       do_MMX_preamble();
   10637       delta = dis_MMXop_regmem_to_reg (
   10638                 sorb, delta+2, insn[1], "paddq", False );
   10639       goto decode_success;
   10640    }
   10641 
   10642    /* 66 0F D4 = PADDQ */
   10643    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10644       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10645                                  "paddq", Iop_Add64x2, False );
   10646       goto decode_success;
   10647    }
   10648 
   10649    /* 66 0F FD = PADDW */
   10650    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10651       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10652                                  "paddw", Iop_Add16x8, False );
   10653       goto decode_success;
   10654    }
   10655 
   10656    /* 66 0F EC = PADDSB */
   10657    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10658       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10659                                  "paddsb", Iop_QAdd8Sx16, False );
   10660       goto decode_success;
   10661    }
   10662 
   10663    /* 66 0F ED = PADDSW */
   10664    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10665       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10666                                  "paddsw", Iop_QAdd16Sx8, False );
   10667       goto decode_success;
   10668    }
   10669 
   10670    /* 66 0F DC = PADDUSB */
   10671    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10672       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10673                                  "paddusb", Iop_QAdd8Ux16, False );
   10674       goto decode_success;
   10675    }
   10676 
   10677    /* 66 0F DD = PADDUSW */
   10678    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10679       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10680                                  "paddusw", Iop_QAdd16Ux8, False );
   10681       goto decode_success;
   10682    }
   10683 
   10684    /* 66 0F DB = PAND */
   10685    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10686       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10687       goto decode_success;
   10688    }
   10689 
   10690    /* 66 0F DF = PANDN */
   10691    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10692       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10693       goto decode_success;
   10694    }
   10695 
   10696    /* 66 0F E0 = PAVGB */
   10697    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10698       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10699                                  "pavgb", Iop_Avg8Ux16, False );
   10700       goto decode_success;
   10701    }
   10702 
   10703    /* 66 0F E3 = PAVGW */
   10704    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10705       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10706                                  "pavgw", Iop_Avg16Ux8, False );
   10707       goto decode_success;
   10708    }
   10709 
   10710    /* 66 0F 74 = PCMPEQB */
   10711    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10712       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10713                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10714       goto decode_success;
   10715    }
   10716 
   10717    /* 66 0F 76 = PCMPEQD */
   10718    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10719       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10720                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10721       goto decode_success;
   10722    }
   10723 
   10724    /* 66 0F 75 = PCMPEQW */
   10725    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10726       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10727                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10728       goto decode_success;
   10729    }
   10730 
   10731    /* 66 0F 64 = PCMPGTB */
   10732    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10733       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10734                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10735       goto decode_success;
   10736    }
   10737 
   10738    /* 66 0F 66 = PCMPGTD */
   10739    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10740       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10741                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10742       goto decode_success;
   10743    }
   10744 
   10745    /* 66 0F 65 = PCMPGTW */
   10746    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10747       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10748                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10749       goto decode_success;
   10750    }
   10751 
   10752    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10753       zero-extend of it in ireg(G). */
   10754    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10755       modrm = insn[2];
   10756       if (sz == 2 && epartIsReg(modrm)) {
   10757          t5 = newTemp(Ity_V128);
   10758          t4 = newTemp(Ity_I16);
   10759          assign(t5, getXMMReg(eregOfRM(modrm)));
   10760          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10761          switch (insn[3] & 7) {
   10762             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   10763             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   10764             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   10765             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   10766             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   10767             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   10768             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   10769             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   10770             default: vassert(0); /*NOTREACHED*/
   10771          }
   10772          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   10773          DIP("pextrw $%d,%s,%s\n",
   10774              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   10775                            nameIReg(4,gregOfRM(modrm)));
   10776          delta += 4;
   10777          goto decode_success;
   10778       }
   10779       /* else fall through */
   10780    }
   10781 
   10782    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   10783       put it into the specified lane of xmm(G). */
   10784    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   10785       Int lane;
   10786       t4 = newTemp(Ity_I16);
   10787       modrm = insn[2];
   10788 
   10789       if (epartIsReg(modrm)) {
   10790          assign(t4, getIReg(2, eregOfRM(modrm)));
   10791          delta += 3+1;
   10792          lane = insn[3+1-1];
   10793          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10794                                    nameIReg(2,eregOfRM(modrm)),
   10795                                    nameXMMReg(gregOfRM(modrm)));
   10796       } else {
   10797          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10798          delta += 3+alen;
   10799          lane = insn[3+alen-1];
   10800          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   10801          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10802                                    dis_buf,
   10803                                    nameXMMReg(gregOfRM(modrm)));
   10804       }
   10805 
   10806       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   10807       goto decode_success;
   10808    }
   10809 
   10810    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   10811       E(xmm or mem) to G(xmm) */
   10812    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   10813       IRTemp s1V  = newTemp(Ity_V128);
   10814       IRTemp s2V  = newTemp(Ity_V128);
   10815       IRTemp dV   = newTemp(Ity_V128);
   10816       IRTemp s1Hi = newTemp(Ity_I64);
   10817       IRTemp s1Lo = newTemp(Ity_I64);
   10818       IRTemp s2Hi = newTemp(Ity_I64);
   10819       IRTemp s2Lo = newTemp(Ity_I64);
   10820       IRTemp dHi  = newTemp(Ity_I64);
   10821       IRTemp dLo  = newTemp(Ity_I64);
   10822       modrm = insn[2];
   10823       if (epartIsReg(modrm)) {
   10824          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   10825          delta += 2+1;
   10826          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10827                                 nameXMMReg(gregOfRM(modrm)));
   10828       } else {
   10829          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10830          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   10831          delta += 2+alen;
   10832          DIP("pmaddwd %s,%s\n", dis_buf,
   10833                                 nameXMMReg(gregOfRM(modrm)));
   10834       }
   10835       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   10836       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   10837       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   10838       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   10839       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   10840       assign( dHi, mkIRExprCCall(
   10841                       Ity_I64, 0/*regparms*/,
   10842                       "x86g_calculate_mmx_pmaddwd",
   10843                       &x86g_calculate_mmx_pmaddwd,
   10844                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   10845                    ));
   10846       assign( dLo, mkIRExprCCall(
   10847                       Ity_I64, 0/*regparms*/,
   10848                       "x86g_calculate_mmx_pmaddwd",
   10849                       &x86g_calculate_mmx_pmaddwd,
   10850                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   10851                    ));
   10852       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   10853       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   10854       goto decode_success;
   10855    }
   10856 
   10857    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   10858    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   10859       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10860                                  "pmaxsw", Iop_Max16Sx8, False );
   10861       goto decode_success;
   10862    }
   10863 
   10864    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   10865    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   10866       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10867                                  "pmaxub", Iop_Max8Ux16, False );
   10868       goto decode_success;
   10869    }
   10870 
   10871    /* 66 0F EA = PMINSW -- 16x8 signed min */
   10872    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   10873       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10874                                  "pminsw", Iop_Min16Sx8, False );
   10875       goto decode_success;
   10876    }
   10877 
   10878    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   10879    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   10880       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10881                                  "pminub", Iop_Min8Ux16, False );
   10882       goto decode_success;
   10883    }
   10884 
   10885    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
   10886       xmm(E), pack them into a 16-bit value, and put zero-extend of it
   10887       in ireg(G).  Doing this directly is just too cumbersome; give up
   10888       therefore and call a helper. */
   10889    /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   10890    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   10891       modrm = insn[2];
   10892       if (epartIsReg(modrm)) {
   10893          t0 = newTemp(Ity_I64);
   10894          t1 = newTemp(Ity_I64);
   10895          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   10896          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   10897          t5 = newTemp(Ity_I32);
   10898          assign(t5, mkIRExprCCall(
   10899                        Ity_I32, 0/*regparms*/,
   10900                        "x86g_calculate_sse_pmovmskb",
   10901                        &x86g_calculate_sse_pmovmskb,
   10902                        mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
   10903          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   10904          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10905                                  nameIReg(4,gregOfRM(modrm)));
   10906          delta += 3;
   10907          goto decode_success;
   10908       }
   10909       /* else fall through */
   10910    }
   10911 
   10912    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   10913    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   10914       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10915                                  "pmulhuw", Iop_MulHi16Ux8, False );
   10916       goto decode_success;
   10917    }
   10918 
   10919    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   10920    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   10921       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10922                                  "pmulhw", Iop_MulHi16Sx8, False );
   10923       goto decode_success;
   10924    }
   10925 
   10926    /* 66 0F D5 = PMULLW -- 16x8 multiply, keeping the low 16 bits of each product */
   10927    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   10928       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10929                                  "pmullw", Iop_Mul16x8, False );
   10930       goto decode_success;
   10931    }
   10932 
   10933    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10934    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10935       0 to form 64-bit result */
   10936    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10937       IRTemp sV = newTemp(Ity_I64);
   10938       IRTemp dV = newTemp(Ity_I64);
   10939       t1 = newTemp(Ity_I32);
   10940       t0 = newTemp(Ity_I32);
   10941       modrm = insn[2];
   10942 
   10943       do_MMX_preamble();
   10944       assign( dV, getMMXReg(gregOfRM(modrm)) );
   10945 
   10946       if (epartIsReg(modrm)) {
   10947          assign( sV, getMMXReg(eregOfRM(modrm)) );
   10948          delta += 2+1;
   10949          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10950                                 nameMMXReg(gregOfRM(modrm)));
   10951       } else {
   10952          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10953          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   10954          delta += 2+alen;
   10955          DIP("pmuludq %s,%s\n", dis_buf,
   10956                                 nameMMXReg(gregOfRM(modrm)));
   10957       }
   10958 
   10959       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   10960       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   10961       putMMXReg( gregOfRM(modrm),
   10962                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   10963       goto decode_success;
   10964    }
   10965 
   10966    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10967       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   10968       half */
   10969    /* This is a really poor translation -- could be improved if
   10970       performance critical */
   10971    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10972       IRTemp sV, dV;
   10973       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10974       sV = newTemp(Ity_V128);
   10975       dV = newTemp(Ity_V128);
   10976       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10977       t1 = newTemp(Ity_I64);
   10978       t0 = newTemp(Ity_I64);
   10979       modrm = insn[2];
   10980       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10981 
   10982       if (epartIsReg(modrm)) {
   10983          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10984          delta += 2+1;
   10985          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10986                                 nameXMMReg(gregOfRM(modrm)));
   10987       } else {
   10988          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10989          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10990          delta += 2+alen;
   10991          DIP("pmuludq %s,%s\n", dis_buf,
   10992                                 nameXMMReg(gregOfRM(modrm)));
   10993       }
   10994 
   10995       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   10996       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   10997 
   10998       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   10999       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   11000       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   11001       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   11002       goto decode_success;
   11003    }
   11004 
   11005    /* 66 0F EB = POR */
   11006    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   11007       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   11008       goto decode_success;
   11009    }
   11010 
   11011    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   11012       from E(xmm or mem) to G(xmm) */
   11013    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   11014       IRTemp s1V  = newTemp(Ity_V128);
   11015       IRTemp s2V  = newTemp(Ity_V128);
   11016       IRTemp dV   = newTemp(Ity_V128);
   11017       IRTemp s1Hi = newTemp(Ity_I64);
   11018       IRTemp s1Lo = newTemp(Ity_I64);
   11019       IRTemp s2Hi = newTemp(Ity_I64);
   11020       IRTemp s2Lo = newTemp(Ity_I64);
   11021       IRTemp dHi  = newTemp(Ity_I64);
   11022       IRTemp dLo  = newTemp(Ity_I64);
   11023       modrm = insn[2];
   11024       if (epartIsReg(modrm)) {
   11025          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   11026          delta += 2+1;
   11027          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11028                                nameXMMReg(gregOfRM(modrm)));
   11029       } else {
   11030          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11031          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   11032          delta += 2+alen;
   11033          DIP("psadbw %s,%s\n", dis_buf,
   11034                                nameXMMReg(gregOfRM(modrm)));
   11035       }
   11036       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   11037       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   11038       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   11039       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   11040       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   11041       assign( dHi, mkIRExprCCall(
   11042                       Ity_I64, 0/*regparms*/,
   11043                       "x86g_calculate_mmx_psadbw",
   11044                       &x86g_calculate_mmx_psadbw,
   11045                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   11046                    ));
   11047       assign( dLo, mkIRExprCCall(
   11048                       Ity_I64, 0/*regparms*/,
   11049                       "x86g_calculate_mmx_psadbw",
   11050                       &x86g_calculate_mmx_psadbw,
   11051                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   11052                    ));
   11053       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   11054       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11055       goto decode_success;
   11056    }
   11057 
   11058    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   11059    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   11060       Int order;
   11061       IRTemp sV, dV, s3, s2, s1, s0;
   11062       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11063       sV = newTemp(Ity_V128);
   11064       dV = newTemp(Ity_V128);
   11065       modrm = insn[2];
   11066       if (epartIsReg(modrm)) {
   11067          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11068          order = (Int)insn[3];
   11069          delta += 2+2;
   11070          DIP("pshufd $%d,%s,%s\n", order,
   11071                                    nameXMMReg(eregOfRM(modrm)),
   11072                                    nameXMMReg(gregOfRM(modrm)));
   11073       } else {
   11074          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11075          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11076 	 order = (Int)insn[2+alen];
   11077          delta += 3+alen;
   11078          DIP("pshufd $%d,%s,%s\n", order,
   11079                                    dis_buf,
   11080                                    nameXMMReg(gregOfRM(modrm)));
   11081       }
   11082       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11083 
   11084 #     define SEL(n) \
   11085                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11086       assign(dV,
   11087 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11088                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11089       );
   11090       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11091 #     undef SEL
   11092       goto decode_success;
   11093    }
   11094 
   11095    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11096       mem) to G(xmm), and copy lower half */
   11097    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11098       Int order;
   11099       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11100       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11101       sV   = newTemp(Ity_V128);
   11102       dV   = newTemp(Ity_V128);
   11103       sVhi = newTemp(Ity_I64);
   11104       dVhi = newTemp(Ity_I64);
   11105       modrm = insn[3];
   11106       if (epartIsReg(modrm)) {
   11107          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11108          order = (Int)insn[4];
   11109          delta += 4+1;
   11110          DIP("pshufhw $%d,%s,%s\n", order,
   11111                                     nameXMMReg(eregOfRM(modrm)),
   11112                                     nameXMMReg(gregOfRM(modrm)));
   11113       } else {
   11114          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11115          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11116 	 order = (Int)insn[3+alen];
   11117          delta += 4+alen;
   11118          DIP("pshufhw $%d,%s,%s\n", order,
   11119                                     dis_buf,
   11120                                     nameXMMReg(gregOfRM(modrm)));
   11121       }
   11122       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11123       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11124 
   11125 #     define SEL(n) \
   11126                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11127       assign(dVhi,
   11128 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11129                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11130       );
   11131       assign(dV, binop( Iop_64HLtoV128,
   11132                         mkexpr(dVhi),
   11133                         unop(Iop_V128to64, mkexpr(sV))) );
   11134       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11135 #     undef SEL
   11136       goto decode_success;
   11137    }
   11138 
   11139    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11140       mem) to G(xmm), and copy upper half */
   11141    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11142       Int order;
   11143       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11144       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11145       sV   = newTemp(Ity_V128);
   11146       dV   = newTemp(Ity_V128);
   11147       sVlo = newTemp(Ity_I64);
   11148       dVlo = newTemp(Ity_I64);
   11149       modrm = insn[3];
   11150       if (epartIsReg(modrm)) {
   11151          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11152          order = (Int)insn[4];
   11153          delta += 4+1;
   11154          DIP("pshuflw $%d,%s,%s\n", order,
   11155                                     nameXMMReg(eregOfRM(modrm)),
   11156                                     nameXMMReg(gregOfRM(modrm)));
   11157       } else {
   11158          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11159          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11160 	 order = (Int)insn[3+alen];
   11161          delta += 4+alen;
   11162          DIP("pshuflw $%d,%s,%s\n", order,
   11163                                     dis_buf,
   11164                                     nameXMMReg(gregOfRM(modrm)));
   11165       }
   11166       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11167       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11168 
   11169 #     define SEL(n) \
   11170                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11171       assign(dVlo,
   11172 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11173                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11174       );
   11175       assign(dV, binop( Iop_64HLtoV128,
   11176                         unop(Iop_V128HIto64, mkexpr(sV)),
   11177                         mkexpr(dVlo) ) );
   11178       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11179 #     undef SEL
   11180       goto decode_success;
   11181    }
   11182 
   11183    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11184    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11185        && epartIsReg(insn[2])
   11186        && gregOfRM(insn[2]) == 6) {
   11187       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11188       goto decode_success;
   11189    }
   11190 
   11191    /* 66 0F F2 = PSLLD by E */
   11192    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11193       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11194       goto decode_success;
   11195    }
   11196 
   11197    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11198    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11199        && epartIsReg(insn[2])
   11200        && gregOfRM(insn[2]) == 7) {
   11201       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11202       Int    imm = (Int)insn[3];
   11203       Int    reg = eregOfRM(insn[2]);
   11204       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11205       vassert(imm >= 0 && imm <= 255);
   11206       delta += 4;
   11207 
   11208       sV    = newTemp(Ity_V128);
   11209       dV    = newTemp(Ity_V128);
   11210       hi64  = newTemp(Ity_I64);
   11211       lo64  = newTemp(Ity_I64);
   11212       hi64r = newTemp(Ity_I64);
   11213       lo64r = newTemp(Ity_I64);
   11214 
   11215       if (imm >= 16) {
   11216          putXMMReg(reg, mkV128(0x0000));
   11217          goto decode_success;
   11218       }
   11219 
   11220       assign( sV, getXMMReg(reg) );
   11221       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11222       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11223 
   11224       if (imm == 0) {
   11225          assign( lo64r, mkexpr(lo64) );
   11226          assign( hi64r, mkexpr(hi64) );
   11227       }
   11228       else
   11229       if (imm == 8) {
   11230          assign( lo64r, mkU64(0) );
   11231          assign( hi64r, mkexpr(lo64) );
   11232       }
   11233       else
   11234       if (imm > 8) {
   11235          assign( lo64r, mkU64(0) );
   11236          assign( hi64r, binop( Iop_Shl64,
   11237                                mkexpr(lo64),
   11238                                mkU8( 8*(imm-8) ) ));
   11239       } else {
   11240          assign( lo64r, binop( Iop_Shl64,
   11241                                mkexpr(lo64),
   11242                                mkU8(8 * imm) ));
   11243          assign( hi64r,
   11244                  binop( Iop_Or64,
   11245                         binop(Iop_Shl64, mkexpr(hi64),
   11246                                          mkU8(8 * imm)),
   11247                         binop(Iop_Shr64, mkexpr(lo64),
   11248                                          mkU8(8 * (8 - imm)) )
   11249                       )
   11250                );
   11251       }
   11252       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11253       putXMMReg(reg, mkexpr(dV));
   11254       goto decode_success;
   11255    }
   11256 
   11257    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11258    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11259        && epartIsReg(insn[2])
   11260        && gregOfRM(insn[2]) == 6) {
   11261       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11262       goto decode_success;
   11263    }
   11264 
   11265    /* 66 0F F3 = PSLLQ by E */
   11266    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11267       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11268       goto decode_success;
   11269    }
   11270 
   11271    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11272    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11273        && epartIsReg(insn[2])
   11274        && gregOfRM(insn[2]) == 6) {
   11275       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11276       goto decode_success;
   11277    }
   11278 
   11279    /* 66 0F F1 = PSLLW by E */
   11280    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11281       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11282       goto decode_success;
   11283    }
   11284 
   11285    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11286    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11287        && epartIsReg(insn[2])
   11288        && gregOfRM(insn[2]) == 4) {
   11289       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11290       goto decode_success;
   11291    }
   11292 
   11293    /* 66 0F E2 = PSRAD by E */
   11294    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11295       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11296       goto decode_success;
   11297    }
   11298 
   11299    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11300    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11301        && epartIsReg(insn[2])
   11302        && gregOfRM(insn[2]) == 4) {
   11303       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11304       goto decode_success;
   11305    }
   11306 
   11307    /* 66 0F E1 = PSRAW by E */
   11308    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11309       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11310       goto decode_success;
   11311    }
   11312 
   11313    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11314    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11315        && epartIsReg(insn[2])
   11316        && gregOfRM(insn[2]) == 2) {
   11317       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11318       goto decode_success;
   11319    }
   11320 
   11321    /* 66 0F D2 = PSRLD by E */
   11322    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11323       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11324       goto decode_success;
   11325    }
   11326 
   11327    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11328    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11329        && epartIsReg(insn[2])
   11330        && gregOfRM(insn[2]) == 3) {
   11331       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11332       Int    imm = (Int)insn[3];
   11333       Int    reg = eregOfRM(insn[2]);
   11334       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11335       vassert(imm >= 0 && imm <= 255);
   11336       delta += 4;
   11337 
   11338       sV    = newTemp(Ity_V128);
   11339       dV    = newTemp(Ity_V128);
   11340       hi64  = newTemp(Ity_I64);
   11341       lo64  = newTemp(Ity_I64);
   11342       hi64r = newTemp(Ity_I64);
   11343       lo64r = newTemp(Ity_I64);
   11344 
   11345       if (imm >= 16) {
   11346          putXMMReg(reg, mkV128(0x0000));
   11347          goto decode_success;
   11348       }
   11349 
   11350       assign( sV, getXMMReg(reg) );
   11351       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11352       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11353 
   11354       if (imm == 0) {
   11355          assign( lo64r, mkexpr(lo64) );
   11356          assign( hi64r, mkexpr(hi64) );
   11357       }
   11358       else
   11359       if (imm == 8) {
   11360          assign( hi64r, mkU64(0) );
   11361          assign( lo64r, mkexpr(hi64) );
   11362       }
   11363       else
   11364       if (imm > 8) {
   11365          assign( hi64r, mkU64(0) );
   11366          assign( lo64r, binop( Iop_Shr64,
   11367                                mkexpr(hi64),
   11368                                mkU8( 8*(imm-8) ) ));
   11369       } else {
   11370          assign( hi64r, binop( Iop_Shr64,
   11371                                mkexpr(hi64),
   11372                                mkU8(8 * imm) ));
   11373          assign( lo64r,
   11374                  binop( Iop_Or64,
   11375                         binop(Iop_Shr64, mkexpr(lo64),
   11376                                          mkU8(8 * imm)),
   11377                         binop(Iop_Shl64, mkexpr(hi64),
   11378                                          mkU8(8 * (8 - imm)) )
   11379                       )
   11380                );
   11381       }
   11382 
   11383       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11384       putXMMReg(reg, mkexpr(dV));
   11385       goto decode_success;
   11386    }
   11387 
   11388    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11389    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11390        && epartIsReg(insn[2])
   11391        && gregOfRM(insn[2]) == 2) {
   11392       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11393       goto decode_success;
   11394    }
   11395 
   11396    /* 66 0F D3 = PSRLQ by E */
   11397    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11398       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11399       goto decode_success;
   11400    }
   11401 
   11402    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11403    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11404        && epartIsReg(insn[2])
   11405        && gregOfRM(insn[2]) == 2) {
   11406       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11407       goto decode_success;
   11408    }
   11409 
   11410    /* 66 0F D1 = PSRLW by E */
   11411    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11412       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11413       goto decode_success;
   11414    }
   11415 
   11416    /* 66 0F F8 = PSUBB */
   11417    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11418       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11419                                  "psubb", Iop_Sub8x16, False );
   11420       goto decode_success;
   11421    }
   11422 
   11423    /* 66 0F FA = PSUBD */
   11424    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11425       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11426                                  "psubd", Iop_Sub32x4, False );
   11427       goto decode_success;
   11428    }
   11429 
   11430    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11431    /* 0F FB = PSUBQ -- sub 64x1 */
   11432    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11433       do_MMX_preamble();
   11434       delta = dis_MMXop_regmem_to_reg (
   11435                 sorb, delta+2, insn[1], "psubq", False );
   11436       goto decode_success;
   11437    }
   11438 
   11439    /* 66 0F FB = PSUBQ */
   11440    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11441       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11442                                  "psubq", Iop_Sub64x2, False );
   11443       goto decode_success;
   11444    }
   11445 
   11446    /* 66 0F F9 = PSUBW */
   11447    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11448       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11449                                  "psubw", Iop_Sub16x8, False );
   11450       goto decode_success;
   11451    }
   11452 
   11453    /* 66 0F E8 = PSUBSB */
   11454    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11455       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11456                                  "psubsb", Iop_QSub8Sx16, False );
   11457       goto decode_success;
   11458    }
   11459 
   11460    /* 66 0F E9 = PSUBSW */
   11461    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11462       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11463                                  "psubsw", Iop_QSub16Sx8, False );
   11464       goto decode_success;
   11465    }
   11466 
   11467    /* 66 0F D8 = PSUBUSB */
   11468    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11469       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11470                                  "psubusb", Iop_QSub8Ux16, False );
   11471       goto decode_success;
   11472    }
   11473 
   11474    /* 66 0F D9 = PSUBUSW */
   11475    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11476       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11477                                  "psubusw", Iop_QSub16Ux8, False );
   11478       goto decode_success;
   11479    }
   11480 
   11481    /* 66 0F 68 = PUNPCKHBW */
   11482    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11483       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11484                                  "punpckhbw",
   11485                                  Iop_InterleaveHI8x16, True );
   11486       goto decode_success;
   11487    }
   11488 
   11489    /* 66 0F 6A = PUNPCKHDQ */
   11490    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11491       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11492                                  "punpckhdq",
   11493                                  Iop_InterleaveHI32x4, True );
   11494       goto decode_success;
   11495    }
   11496 
   11497    /* 66 0F 6D = PUNPCKHQDQ */
   11498    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11499       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11500                                  "punpckhqdq",
   11501                                  Iop_InterleaveHI64x2, True );
   11502       goto decode_success;
   11503    }
   11504 
   11505    /* 66 0F 69 = PUNPCKHWD */
   11506    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11507       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11508                                  "punpckhwd",
   11509                                  Iop_InterleaveHI16x8, True );
   11510       goto decode_success;
   11511    }
   11512 
   11513    /* 66 0F 60 = PUNPCKLBW */
   11514    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11515       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11516                                  "punpcklbw",
   11517                                  Iop_InterleaveLO8x16, True );
   11518       goto decode_success;
   11519    }
   11520 
   11521    /* 66 0F 62 = PUNPCKLDQ */
   11522    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11523       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11524                                  "punpckldq",
   11525                                  Iop_InterleaveLO32x4, True );
   11526       goto decode_success;
   11527    }
   11528 
   11529    /* 66 0F 6C = PUNPCKLQDQ */
   11530    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11531       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11532                                  "punpcklqdq",
   11533                                  Iop_InterleaveLO64x2, True );
   11534       goto decode_success;
   11535    }
   11536 
   11537    /* 66 0F 61 = PUNPCKLWD */
   11538    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11539       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11540                                  "punpcklwd",
   11541                                  Iop_InterleaveLO16x8, True );
   11542       goto decode_success;
   11543    }
   11544 
   11545    /* 66 0F EF = PXOR */
   11546    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11547       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11548       goto decode_success;
   11549    }
   11550 
   11551 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11552 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11553 //--        && (!epartIsReg(insn[2]))
   11554 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11555 //--       Bool store = gregOfRM(insn[2]) == 0;
   11556 //--       vg_assert(sz == 4);
   11557 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11558 //--       t1   = LOW24(pair);
   11559 //--       eip += 2+HI8(pair);
   11560 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11561 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11562 //--                   Lit16, (UShort)insn[2],
   11563 //--                   TempReg, t1 );
   11564 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11565 //--       goto decode_success;
   11566 //--    }
   11567 
   11568    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11569    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11570        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11571 
   11572       /* This is something of a hack.  We need to know the size of the
   11573          cache line containing addr.  Since we don't (easily), assume
   11574          256 on the basis that no real cache would have a line that
   11575          big.  It's safe to invalidate more stuff than we need, just
   11576          inefficient. */
   11577       UInt lineszB = 256;
   11578 
   11579       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11580       delta += 2+alen;
   11581 
   11582       /* Round addr down to the start of the containing block. */
   11583       stmt( IRStmt_Put(
   11584                OFFB_TISTART,
   11585                binop( Iop_And32,
   11586                       mkexpr(addr),
   11587                       mkU32( ~(lineszB-1) ))) );
   11588 
   11589       stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
   11590 
   11591       jmp_lit(&dres, Ijk_TInval, (Addr32)(guest_EIP_bbstart+delta));
   11592 
   11593       DIP("clflush %s\n", dis_buf);
   11594       goto decode_success;
   11595    }
   11596 
   11597    /* ---------------------------------------------------- */
   11598    /* --- end of the SSE2 decoder.                     --- */
   11599    /* ---------------------------------------------------- */
   11600 
   11601    /* ---------------------------------------------------- */
   11602    /* --- start of the SSE3 decoder.                   --- */
   11603    /* ---------------------------------------------------- */
   11604 
   11605    /* Skip parts of the decoder which don't apply given the stated
   11606       guest subarchitecture. */
   11607    /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
   11608    /* In fact this is highly bogus; we accept SSE3 insns even on a
   11609       SSE2-only guest since they turn into IR which can be re-emitted
   11610       successfully on an SSE2 host. */
   11611    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   11612       goto after_sse_decoders; /* no SSE3 capabilities */
   11613 
   11614    insn = (UChar*)&guest_code[delta];
   11615 
   11616    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11617       duplicating some lanes (2:2:0:0). */
   11618    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11619       duplicating some lanes (3:3:1:1). */
   11620    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11621        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11622       IRTemp s3, s2, s1, s0;
   11623       IRTemp sV  = newTemp(Ity_V128);
   11624       Bool   isH = insn[2] == 0x16;
   11625       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11626 
   11627       modrm = insn[3];
   11628       if (epartIsReg(modrm)) {
   11629          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11630          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11631                                   nameXMMReg(eregOfRM(modrm)),
   11632                                   nameXMMReg(gregOfRM(modrm)));
   11633          delta += 3+1;
   11634       } else {
   11635          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11636          gen_SEGV_if_not_16_aligned( addr );
   11637          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11638          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11639 	     dis_buf,
   11640              nameXMMReg(gregOfRM(modrm)));
   11641          delta += 3+alen;
   11642       }
   11643 
   11644       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11645       putXMMReg( gregOfRM(modrm),
   11646                  isH ? mk128from32s( s3, s3, s1, s1 )
   11647                      : mk128from32s( s2, s2, s0, s0 ) );
   11648       goto decode_success;
   11649    }
   11650 
   11651    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11652       duplicating some lanes (0:1:0:1). */
   11653    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11654       IRTemp sV = newTemp(Ity_V128);
   11655       IRTemp d0 = newTemp(Ity_I64);
   11656 
   11657       modrm = insn[3];
   11658       if (epartIsReg(modrm)) {
   11659          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11660          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11661                                 nameXMMReg(gregOfRM(modrm)));
   11662          delta += 3+1;
   11663          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11664       } else {
   11665          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11666          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11667          DIP("movddup %s,%s\n", dis_buf,
   11668                                 nameXMMReg(gregOfRM(modrm)));
   11669          delta += 3+alen;
   11670       }
   11671 
   11672       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11673       goto decode_success;
   11674    }
   11675 
   11676    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11677    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11678       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11679       IRTemp eV   = newTemp(Ity_V128);
   11680       IRTemp gV   = newTemp(Ity_V128);
   11681       IRTemp addV = newTemp(Ity_V128);
   11682       IRTemp subV = newTemp(Ity_V128);
   11683       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11684 
   11685       modrm = insn[3];
   11686       if (epartIsReg(modrm)) {
   11687          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11688          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11689                                  nameXMMReg(gregOfRM(modrm)));
   11690          delta += 3+1;
   11691       } else {
   11692          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11693          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11694          DIP("addsubps %s,%s\n", dis_buf,
   11695                                  nameXMMReg(gregOfRM(modrm)));
   11696          delta += 3+alen;
   11697       }
   11698 
   11699       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11700 
   11701       assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
   11702       assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
   11703 
   11704       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11705       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11706 
   11707       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11708       goto decode_success;
   11709    }
   11710 
   11711    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11712    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11713       IRTemp eV   = newTemp(Ity_V128);
   11714       IRTemp gV   = newTemp(Ity_V128);
   11715       IRTemp addV = newTemp(Ity_V128);
   11716       IRTemp subV = newTemp(Ity_V128);
   11717       IRTemp a1     = newTemp(Ity_I64);
   11718       IRTemp s0     = newTemp(Ity_I64);
   11719 
   11720       modrm = insn[2];
   11721       if (epartIsReg(modrm)) {
   11722          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11723          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11724                                  nameXMMReg(gregOfRM(modrm)));
   11725          delta += 2+1;
   11726       } else {
   11727          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11728          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11729          DIP("addsubpd %s,%s\n", dis_buf,
   11730                                  nameXMMReg(gregOfRM(modrm)));
   11731          delta += 2+alen;
   11732       }
   11733 
   11734       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11735 
   11736       assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
   11737       assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
   11738 
   11739       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11740       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11741 
   11742       putXMMReg( gregOfRM(modrm),
   11743                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11744       goto decode_success;
   11745    }
   11746 
   11747    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11748    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11749    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11750        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11751       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11752       IRTemp eV     = newTemp(Ity_V128);
   11753       IRTemp gV     = newTemp(Ity_V128);
   11754       IRTemp leftV  = newTemp(Ity_V128);
   11755       IRTemp rightV = newTemp(Ity_V128);
   11756       Bool   isAdd  = insn[2] == 0x7C;
   11757       HChar* str    = isAdd ? "add" : "sub";
   11758       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11759 
   11760       modrm = insn[3];
   11761       if (epartIsReg(modrm)) {
   11762          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11763          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11764                                    nameXMMReg(gregOfRM(modrm)));
   11765          delta += 3+1;
   11766       } else {
   11767          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11768          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11769          DIP("h%sps %s,%s\n", str, dis_buf,
   11770                                    nameXMMReg(gregOfRM(modrm)));
   11771          delta += 3+alen;
   11772       }
   11773 
   11774       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11775 
   11776       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   11777       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   11778 
   11779       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   11780       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   11781 
   11782       putXMMReg( gregOfRM(modrm),
   11783                  binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   11784                        mkexpr(leftV), mkexpr(rightV) ) );
   11785       goto decode_success;
   11786    }
   11787 
   11788    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   11789    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   11790    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   11791       IRTemp e1     = newTemp(Ity_I64);
   11792       IRTemp e0     = newTemp(Ity_I64);
   11793       IRTemp g1     = newTemp(Ity_I64);
   11794       IRTemp g0     = newTemp(Ity_I64);
   11795       IRTemp eV     = newTemp(Ity_V128);
   11796       IRTemp gV     = newTemp(Ity_V128);
   11797       IRTemp leftV  = newTemp(Ity_V128);
   11798       IRTemp rightV = newTemp(Ity_V128);
   11799       Bool   isAdd  = insn[1] == 0x7C;
   11800       HChar* str    = isAdd ? "add" : "sub";
   11801 
   11802       modrm = insn[2];
   11803       if (epartIsReg(modrm)) {
   11804          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11805          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11806                                    nameXMMReg(gregOfRM(modrm)));
   11807          delta += 2+1;
   11808       } else {
   11809          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11810          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11811          DIP("h%spd %s,%s\n", str, dis_buf,
   11812                               nameXMMReg(gregOfRM(modrm)));
   11813          delta += 2+alen;
   11814       }
   11815 
   11816       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11817 
   11818       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   11819       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   11820       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   11821       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   11822 
   11823       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   11824       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   11825 
   11826       putXMMReg( gregOfRM(modrm),
   11827                  binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   11828                        mkexpr(leftV), mkexpr(rightV) ) );
   11829       goto decode_success;
   11830    }
   11831 
   11832    /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   11833    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   11834       modrm = getIByte(delta+3);
   11835       if (epartIsReg(modrm)) {
   11836          goto decode_failure;
   11837       } else {
   11838          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11839          putXMMReg( gregOfRM(modrm),
   11840                     loadLE(Ity_V128, mkexpr(addr)) );
   11841          DIP("lddqu %s,%s\n", dis_buf,
   11842                               nameXMMReg(gregOfRM(modrm)));
   11843          delta += 3+alen;
   11844       }
   11845       goto decode_success;
   11846    }
   11847 
   11848    /* ---------------------------------------------------- */
   11849    /* --- end of the SSE3 decoder.                     --- */
   11850    /* ---------------------------------------------------- */
   11851 
   11852    /* ---------------------------------------------------- */
   11853    /* --- start of the SSSE3 decoder.                  --- */
   11854    /* ---------------------------------------------------- */
   11855 
   11856    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11857       Unsigned Bytes (MMX) */
   11858    if (sz == 4
   11859        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11860       IRTemp sV        = newTemp(Ity_I64);
   11861       IRTemp dV        = newTemp(Ity_I64);
   11862       IRTemp sVoddsSX  = newTemp(Ity_I64);
   11863       IRTemp sVevensSX = newTemp(Ity_I64);
   11864       IRTemp dVoddsZX  = newTemp(Ity_I64);
   11865       IRTemp dVevensZX = newTemp(Ity_I64);
   11866 
   11867       modrm = insn[3];
   11868       do_MMX_preamble();
   11869       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11870 
   11871       if (epartIsReg(modrm)) {
   11872          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11873          delta += 3+1;
   11874          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11875                                   nameMMXReg(gregOfRM(modrm)));
   11876       } else {
   11877          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11878          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11879          delta += 3+alen;
   11880          DIP("pmaddubsw %s,%s\n", dis_buf,
   11881                                   nameMMXReg(gregOfRM(modrm)));
   11882       }
   11883 
   11884       /* compute dV unsigned x sV signed */
   11885       assign( sVoddsSX,
   11886               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   11887       assign( sVevensSX,
   11888               binop(Iop_SarN16x4,
   11889                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   11890                     mkU8(8)) );
   11891       assign( dVoddsZX,
   11892               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   11893       assign( dVevensZX,
   11894               binop(Iop_ShrN16x4,
   11895                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   11896                     mkU8(8)) );
   11897 
   11898       putMMXReg(
   11899          gregOfRM(modrm),
   11900          binop(Iop_QAdd16Sx4,
   11901                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11902                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11903          )
   11904       );
   11905       goto decode_success;
   11906    }
   11907 
   11908    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11909       Unsigned Bytes (XMM) */
   11910    if (sz == 2
   11911        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11912       IRTemp sV        = newTemp(Ity_V128);
   11913       IRTemp dV        = newTemp(Ity_V128);
   11914       IRTemp sVoddsSX  = newTemp(Ity_V128);
   11915       IRTemp sVevensSX = newTemp(Ity_V128);
   11916       IRTemp dVoddsZX  = newTemp(Ity_V128);
   11917       IRTemp dVevensZX = newTemp(Ity_V128);
   11918 
   11919       modrm = insn[3];
   11920       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11921 
   11922       if (epartIsReg(modrm)) {
   11923          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11924          delta += 3+1;
   11925          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11926                                   nameXMMReg(gregOfRM(modrm)));
   11927       } else {
   11928          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11929          gen_SEGV_if_not_16_aligned( addr );
   11930          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11931          delta += 3+alen;
   11932          DIP("pmaddubsw %s,%s\n", dis_buf,
   11933                                   nameXMMReg(gregOfRM(modrm)));
   11934       }
   11935 
   11936       /* compute dV unsigned x sV signed */
   11937       assign( sVoddsSX,
   11938               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   11939       assign( sVevensSX,
   11940               binop(Iop_SarN16x8,
   11941                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   11942                     mkU8(8)) );
   11943       assign( dVoddsZX,
   11944               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   11945       assign( dVevensZX,
   11946               binop(Iop_ShrN16x8,
   11947                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   11948                     mkU8(8)) );
   11949 
   11950       putXMMReg(
   11951          gregOfRM(modrm),
   11952          binop(Iop_QAdd16Sx8,
   11953                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11954                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11955          )
   11956       );
   11957       goto decode_success;
   11958    }
   11959 
   11960    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   11961    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   11962       mmx) and G to G (mmx). */
   11963    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   11964       mmx) and G to G (mmx). */
   11965    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   11966       to G (mmx). */
   11967    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   11968       to G (mmx). */
   11969    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   11970       to G (mmx). */
   11971    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   11972       to G (mmx). */
   11973 
   11974    if (sz == 4
   11975        && insn[0] == 0x0F && insn[1] == 0x38
   11976        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   11977            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   11978       HChar* str    = "???";
   11979       IROp   opV64  = Iop_INVALID;
   11980       IROp   opCatO = Iop_CatOddLanes16x4;
   11981       IROp   opCatE = Iop_CatEvenLanes16x4;
   11982       IRTemp sV     = newTemp(Ity_I64);
   11983       IRTemp dV     = newTemp(Ity_I64);
   11984 
   11985       modrm = insn[3];
   11986 
   11987       switch (insn[2]) {
   11988          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   11989          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   11990          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   11991          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   11992          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   11993          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   11994          default: vassert(0);
   11995       }
   11996       if (insn[2] == 0x02 || insn[2] == 0x06) {
   11997          opCatO = Iop_InterleaveHI32x2;
   11998          opCatE = Iop_InterleaveLO32x2;
   11999       }
   12000 
   12001       do_MMX_preamble();
   12002       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12003 
   12004       if (epartIsReg(modrm)) {
   12005          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12006          delta += 3+1;
   12007          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12008                                   nameMMXReg(gregOfRM(modrm)));
   12009       } else {
   12010          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12011          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12012          delta += 3+alen;
   12013          DIP("ph%s %s,%s\n", str, dis_buf,
   12014                                   nameMMXReg(gregOfRM(modrm)));
   12015       }
   12016 
   12017       putMMXReg(
   12018          gregOfRM(modrm),
   12019          binop(opV64,
   12020                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   12021                binop(opCatO,mkexpr(sV),mkexpr(dV))
   12022          )
   12023       );
   12024       goto decode_success;
   12025    }
   12026 
   12027    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   12028       xmm) and G to G (xmm). */
   12029    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   12030       xmm) and G to G (xmm). */
   12031    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   12032       G to G (xmm). */
   12033    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   12034       G to G (xmm). */
   12035    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   12036       G to G (xmm). */
   12037    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   12038       G to G (xmm). */
   12039 
   12040    if (sz == 2
   12041        && insn[0] == 0x0F && insn[1] == 0x38
   12042        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   12043            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   12044       HChar* str    = "???";
   12045       IROp   opV64  = Iop_INVALID;
   12046       IROp   opCatO = Iop_CatOddLanes16x4;
   12047       IROp   opCatE = Iop_CatEvenLanes16x4;
   12048       IRTemp sV     = newTemp(Ity_V128);
   12049       IRTemp dV     = newTemp(Ity_V128);
   12050       IRTemp sHi    = newTemp(Ity_I64);
   12051       IRTemp sLo    = newTemp(Ity_I64);
   12052       IRTemp dHi    = newTemp(Ity_I64);
   12053       IRTemp dLo    = newTemp(Ity_I64);
   12054 
   12055       modrm = insn[3];
   12056 
   12057       switch (insn[2]) {
   12058          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   12059          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   12060          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   12061          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   12062          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   12063          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   12064          default: vassert(0);
   12065       }
   12066       if (insn[2] == 0x02 || insn[2] == 0x06) {
   12067          opCatO = Iop_InterleaveHI32x2;
   12068          opCatE = Iop_InterleaveLO32x2;
   12069       }
   12070 
   12071       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12072 
   12073       if (epartIsReg(modrm)) {
   12074          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12075          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12076                                   nameXMMReg(gregOfRM(modrm)));
   12077          delta += 3+1;
   12078       } else {
   12079          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12080          gen_SEGV_if_not_16_aligned( addr );
   12081          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12082          DIP("ph%s %s,%s\n", str, dis_buf,
   12083                              nameXMMReg(gregOfRM(modrm)));
   12084          delta += 3+alen;
   12085       }
   12086 
   12087       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12088       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12089       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12090       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12091 
   12092       /* This isn't a particularly efficient way to compute the
   12093          result, but at least it avoids a proliferation of IROps,
   12094          hence avoids complicating all the backends. */
   12095       putXMMReg(
   12096          gregOfRM(modrm),
   12097          binop(Iop_64HLtoV128,
   12098                binop(opV64,
   12099                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12100                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12101                ),
   12102                binop(opV64,
   12103                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12104                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12105                )
   12106          )
   12107       );
   12108       goto decode_success;
   12109    }
   12110 
   12111    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12112       (MMX) */
   12113    if (sz == 4
   12114        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12115       IRTemp sV = newTemp(Ity_I64);
   12116       IRTemp dV = newTemp(Ity_I64);
   12117 
   12118       modrm = insn[3];
   12119       do_MMX_preamble();
   12120       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12121 
   12122       if (epartIsReg(modrm)) {
   12123          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12124          delta += 3+1;
   12125          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12126                                  nameMMXReg(gregOfRM(modrm)));
   12127       } else {
   12128          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12129          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12130          delta += 3+alen;
   12131          DIP("pmulhrsw %s,%s\n", dis_buf,
   12132                                  nameMMXReg(gregOfRM(modrm)));
   12133       }
   12134 
   12135       putMMXReg(
   12136          gregOfRM(modrm),
   12137          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12138       );
   12139       goto decode_success;
   12140    }
   12141 
   12142    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12143       Scale (XMM) */
   12144    if (sz == 2
   12145        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12146       IRTemp sV  = newTemp(Ity_V128);
   12147       IRTemp dV  = newTemp(Ity_V128);
   12148       IRTemp sHi = newTemp(Ity_I64);
   12149       IRTemp sLo = newTemp(Ity_I64);
   12150       IRTemp dHi = newTemp(Ity_I64);
   12151       IRTemp dLo = newTemp(Ity_I64);
   12152 
   12153       modrm = insn[3];
   12154       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12155 
   12156       if (epartIsReg(modrm)) {
   12157          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12158          delta += 3+1;
   12159          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12160                                  nameXMMReg(gregOfRM(modrm)));
   12161       } else {
   12162          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12163          gen_SEGV_if_not_16_aligned( addr );
   12164          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12165          delta += 3+alen;
   12166          DIP("pmulhrsw %s,%s\n", dis_buf,
   12167                                  nameXMMReg(gregOfRM(modrm)));
   12168       }
   12169 
   12170       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12171       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12172       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12173       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12174 
   12175       putXMMReg(
   12176          gregOfRM(modrm),
   12177          binop(Iop_64HLtoV128,
   12178                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12179                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12180          )
   12181       );
   12182       goto decode_success;
   12183    }
   12184 
   12185    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12186    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12187    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12188    if (sz == 4
   12189        && insn[0] == 0x0F && insn[1] == 0x38
   12190        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12191       IRTemp sV      = newTemp(Ity_I64);
   12192       IRTemp dV      = newTemp(Ity_I64);
   12193       HChar* str     = "???";
   12194       Int    laneszB = 0;
   12195 
   12196       switch (insn[2]) {
   12197          case 0x08: laneszB = 1; str = "b"; break;
   12198          case 0x09: laneszB = 2; str = "w"; break;
   12199          case 0x0A: laneszB = 4; str = "d"; break;
   12200          default: vassert(0);
   12201       }
   12202 
   12203       modrm = insn[3];
   12204       do_MMX_preamble();
   12205       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12206 
   12207       if (epartIsReg(modrm)) {
   12208          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12209          delta += 3+1;
   12210          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12211                                      nameMMXReg(gregOfRM(modrm)));
   12212       } else {
   12213          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12214          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12215          delta += 3+alen;
   12216          DIP("psign%s %s,%s\n", str, dis_buf,
   12217                                      nameMMXReg(gregOfRM(modrm)));
   12218       }
   12219 
   12220       putMMXReg(
   12221          gregOfRM(modrm),
   12222          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12223       );
   12224       goto decode_success;
   12225    }
   12226 
   12227    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12228    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12229    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12230    if (sz == 2
   12231        && insn[0] == 0x0F && insn[1] == 0x38
   12232        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12233       IRTemp sV      = newTemp(Ity_V128);
   12234       IRTemp dV      = newTemp(Ity_V128);
   12235       IRTemp sHi     = newTemp(Ity_I64);
   12236       IRTemp sLo     = newTemp(Ity_I64);
   12237       IRTemp dHi     = newTemp(Ity_I64);
   12238       IRTemp dLo     = newTemp(Ity_I64);
   12239       HChar* str     = "???";
   12240       Int    laneszB = 0;
   12241 
   12242       switch (insn[2]) {
   12243          case 0x08: laneszB = 1; str = "b"; break;
   12244          case 0x09: laneszB = 2; str = "w"; break;
   12245          case 0x0A: laneszB = 4; str = "d"; break;
   12246          default: vassert(0);
   12247       }
   12248 
   12249       modrm = insn[3];
   12250       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12251 
   12252       if (epartIsReg(modrm)) {
   12253          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12254          delta += 3+1;
   12255          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12256                                      nameXMMReg(gregOfRM(modrm)));
   12257       } else {
   12258          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12259          gen_SEGV_if_not_16_aligned( addr );
   12260          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12261          delta += 3+alen;
   12262          DIP("psign%s %s,%s\n", str, dis_buf,
   12263                                      nameXMMReg(gregOfRM(modrm)));
   12264       }
   12265 
   12266       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12267       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12268       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12269       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12270 
   12271       putXMMReg(
   12272          gregOfRM(modrm),
   12273          binop(Iop_64HLtoV128,
   12274                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12275                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12276          )
   12277       );
   12278       goto decode_success;
   12279    }
   12280 
   12281    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12282    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12283    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12284    if (sz == 4
   12285        && insn[0] == 0x0F && insn[1] == 0x38
   12286        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12287       IRTemp sV      = newTemp(Ity_I64);
   12288       HChar* str     = "???";
   12289       Int    laneszB = 0;
   12290 
   12291       switch (insn[2]) {
   12292          case 0x1C: laneszB = 1; str = "b"; break;
   12293          case 0x1D: laneszB = 2; str = "w"; break;
   12294          case 0x1E: laneszB = 4; str = "d"; break;
   12295          default: vassert(0);
   12296       }
   12297 
   12298       modrm = insn[3];
   12299       do_MMX_preamble();
   12300 
   12301       if (epartIsReg(modrm)) {
   12302          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12303          delta += 3+1;
   12304          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12305                                     nameMMXReg(gregOfRM(modrm)));
   12306       } else {
   12307          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12308          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12309          delta += 3+alen;
   12310          DIP("pabs%s %s,%s\n", str, dis_buf,
   12311                                     nameMMXReg(gregOfRM(modrm)));
   12312       }
   12313 
   12314       putMMXReg(
   12315          gregOfRM(modrm),
   12316          dis_PABS_helper( mkexpr(sV), laneszB )
   12317       );
   12318       goto decode_success;
   12319    }
   12320 
   12321    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12322    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12323    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12324    if (sz == 2
   12325        && insn[0] == 0x0F && insn[1] == 0x38
   12326        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12327       IRTemp sV      = newTemp(Ity_V128);
   12328       IRTemp sHi     = newTemp(Ity_I64);
   12329       IRTemp sLo     = newTemp(Ity_I64);
   12330       HChar* str     = "???";
   12331       Int    laneszB = 0;
   12332 
   12333       switch (insn[2]) {
   12334          case 0x1C: laneszB = 1; str = "b"; break;
   12335          case 0x1D: laneszB = 2; str = "w"; break;
   12336          case 0x1E: laneszB = 4; str = "d"; break;
   12337          default: vassert(0);
   12338       }
   12339 
   12340       modrm = insn[3];
   12341 
   12342       if (epartIsReg(modrm)) {
   12343          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12344          delta += 3+1;
   12345          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12346                                     nameXMMReg(gregOfRM(modrm)));
   12347       } else {
   12348          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12349          gen_SEGV_if_not_16_aligned( addr );
   12350          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12351          delta += 3+alen;
   12352          DIP("pabs%s %s,%s\n", str, dis_buf,
   12353                                     nameXMMReg(gregOfRM(modrm)));
   12354       }
   12355 
   12356       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12357       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12358 
   12359       putXMMReg(
   12360          gregOfRM(modrm),
   12361          binop(Iop_64HLtoV128,
   12362                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12363                dis_PABS_helper( mkexpr(sLo), laneszB )
   12364          )
   12365       );
   12366       goto decode_success;
   12367    }
   12368 
   12369    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12370    if (sz == 4
   12371        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12372       IRTemp sV  = newTemp(Ity_I64);
   12373       IRTemp dV  = newTemp(Ity_I64);
   12374       IRTemp res = newTemp(Ity_I64);
   12375 
   12376       modrm = insn[3];
   12377       do_MMX_preamble();
   12378       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12379 
   12380       if (epartIsReg(modrm)) {
   12381          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12382          d32 = (UInt)insn[3+1];
   12383          delta += 3+1+1;
   12384          DIP("palignr $%d,%s,%s\n",  (Int)d32,
   12385                                      nameMMXReg(eregOfRM(modrm)),
   12386                                      nameMMXReg(gregOfRM(modrm)));
   12387       } else {
   12388          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12389          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12390          d32 = (UInt)insn[3+alen];
   12391          delta += 3+alen+1;
   12392          DIP("palignr $%d%s,%s\n", (Int)d32,
   12393                                    dis_buf,
   12394                                    nameMMXReg(gregOfRM(modrm)));
   12395       }
   12396 
   12397       if (d32 == 0) {
   12398          assign( res, mkexpr(sV) );
   12399       }
   12400       else if (d32 >= 1 && d32 <= 7) {
   12401          assign(res,
   12402                 binop(Iop_Or64,
   12403                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12404                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12405                      )));
   12406       }
   12407       else if (d32 == 8) {
   12408         assign( res, mkexpr(dV) );
   12409       }
   12410       else if (d32 >= 9 && d32 <= 15) {
   12411          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12412       }
   12413       else if (d32 >= 16 && d32 <= 255) {
   12414          assign( res, mkU64(0) );
   12415       }
   12416       else
   12417          vassert(0);
   12418 
   12419       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12420       goto decode_success;
   12421    }
   12422 
   12423    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12424    if (sz == 2
   12425        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12426       IRTemp sV  = newTemp(Ity_V128);
   12427       IRTemp dV  = newTemp(Ity_V128);
   12428       IRTemp sHi = newTemp(Ity_I64);
   12429       IRTemp sLo = newTemp(Ity_I64);
   12430       IRTemp dHi = newTemp(Ity_I64);
   12431       IRTemp dLo = newTemp(Ity_I64);
   12432       IRTemp rHi = newTemp(Ity_I64);
   12433       IRTemp rLo = newTemp(Ity_I64);
   12434 
   12435       modrm = insn[3];
   12436       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12437 
   12438       if (epartIsReg(modrm)) {
   12439          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12440          d32 = (UInt)insn[3+1];
   12441          delta += 3+1+1;
   12442          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12443                                     nameXMMReg(eregOfRM(modrm)),
   12444                                     nameXMMReg(gregOfRM(modrm)));
   12445       } else {
   12446          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12447          gen_SEGV_if_not_16_aligned( addr );
   12448          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12449          d32 = (UInt)insn[3+alen];
   12450          delta += 3+alen+1;
   12451          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12452                                     dis_buf,
   12453                                     nameXMMReg(gregOfRM(modrm)));
   12454       }
   12455 
   12456       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12457       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12458       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12459       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12460 
   12461       if (d32 == 0) {
   12462          assign( rHi, mkexpr(sHi) );
   12463          assign( rLo, mkexpr(sLo) );
   12464       }
   12465       else if (d32 >= 1 && d32 <= 7) {
   12466          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12467          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12468       }
   12469       else if (d32 == 8) {
   12470          assign( rHi, mkexpr(dLo) );
   12471          assign( rLo, mkexpr(sHi) );
   12472       }
   12473       else if (d32 >= 9 && d32 <= 15) {
   12474          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12475          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12476       }
   12477       else if (d32 == 16) {
   12478          assign( rHi, mkexpr(dHi) );
   12479          assign( rLo, mkexpr(dLo) );
   12480       }
   12481       else if (d32 >= 17 && d32 <= 23) {
   12482          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12483          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12484       }
   12485       else if (d32 == 24) {
   12486          assign( rHi, mkU64(0) );
   12487          assign( rLo, mkexpr(dHi) );
   12488       }
   12489       else if (d32 >= 25 && d32 <= 31) {
   12490          assign( rHi, mkU64(0) );
   12491          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12492       }
   12493       else if (d32 >= 32 && d32 <= 255) {
   12494          assign( rHi, mkU64(0) );
   12495          assign( rLo, mkU64(0) );
   12496       }
   12497       else
   12498          vassert(0);
   12499 
   12500       putXMMReg(
   12501          gregOfRM(modrm),
   12502          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12503       );
   12504       goto decode_success;
   12505    }
   12506 
   12507    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12508    if (sz == 4
   12509        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12510       IRTemp sV      = newTemp(Ity_I64);
   12511       IRTemp dV      = newTemp(Ity_I64);
   12512 
   12513       modrm = insn[3];
   12514       do_MMX_preamble();
   12515       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12516 
   12517       if (epartIsReg(modrm)) {
   12518          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12519          delta += 3+1;
   12520          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12521                                nameMMXReg(gregOfRM(modrm)));
   12522       } else {
   12523          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12524          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12525          delta += 3+alen;
   12526          DIP("pshufb %s,%s\n", dis_buf,
   12527                                nameMMXReg(gregOfRM(modrm)));
   12528       }
   12529 
   12530       putMMXReg(
   12531          gregOfRM(modrm),
   12532          binop(
   12533             Iop_And64,
   12534             /* permute the lanes */
   12535             binop(
   12536                Iop_Perm8x8,
   12537                mkexpr(dV),
   12538                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12539             ),
   12540             /* mask off lanes which have (index & 0x80) == 0x80 */
   12541             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12542          )
   12543       );
   12544       goto decode_success;
   12545    }
   12546 
   12547    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12548    if (sz == 2
   12549        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12550       IRTemp sV         = newTemp(Ity_V128);
   12551       IRTemp dV         = newTemp(Ity_V128);
   12552       IRTemp sHi        = newTemp(Ity_I64);
   12553       IRTemp sLo        = newTemp(Ity_I64);
   12554       IRTemp dHi        = newTemp(Ity_I64);
   12555       IRTemp dLo        = newTemp(Ity_I64);
   12556       IRTemp rHi        = newTemp(Ity_I64);
   12557       IRTemp rLo        = newTemp(Ity_I64);
   12558       IRTemp sevens     = newTemp(Ity_I64);
   12559       IRTemp mask0x80hi = newTemp(Ity_I64);
   12560       IRTemp mask0x80lo = newTemp(Ity_I64);
   12561       IRTemp maskBit3hi = newTemp(Ity_I64);
   12562       IRTemp maskBit3lo = newTemp(Ity_I64);
   12563       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12564       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12565       IRTemp permdHi    = newTemp(Ity_I64);
   12566       IRTemp permdLo    = newTemp(Ity_I64);
   12567 
   12568       modrm = insn[3];
   12569       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12570 
   12571       if (epartIsReg(modrm)) {
   12572          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12573          delta += 3+1;
   12574          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12575                                nameXMMReg(gregOfRM(modrm)));
   12576       } else {
   12577          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12578          gen_SEGV_if_not_16_aligned( addr );
   12579          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12580          delta += 3+alen;
   12581          DIP("pshufb %s,%s\n", dis_buf,
   12582                                nameXMMReg(gregOfRM(modrm)));
   12583       }
   12584 
   12585       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12586       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12587       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12588       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12589 
   12590       assign( sevens, mkU64(0x0707070707070707ULL) );
   12591 
   12592       /*
   12593       mask0x80hi = Not(SarN8x8(sHi,7))
   12594       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12595       sAnd7hi    = And(sHi,sevens)
   12596       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12597                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12598       rHi        = And(permdHi,mask0x80hi)
   12599       */
   12600       assign(
   12601          mask0x80hi,
   12602          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12603 
   12604       assign(
   12605          maskBit3hi,
   12606          binop(Iop_SarN8x8,
   12607                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12608                mkU8(7)));
   12609 
   12610       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12611 
   12612       assign(
   12613          permdHi,
   12614          binop(
   12615             Iop_Or64,
   12616             binop(Iop_And64,
   12617                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12618                   mkexpr(maskBit3hi)),
   12619             binop(Iop_And64,
   12620                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12621                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12622 
   12623       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12624 
   12625       /* And the same for the lower half of the result.  What fun. */
   12626 
   12627       assign(
   12628          mask0x80lo,
   12629          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12630 
   12631       assign(
   12632          maskBit3lo,
   12633          binop(Iop_SarN8x8,
   12634                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12635                mkU8(7)));
   12636 
   12637       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12638 
   12639       assign(
   12640          permdLo,
   12641          binop(
   12642             Iop_Or64,
   12643             binop(Iop_And64,
   12644                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12645                   mkexpr(maskBit3lo)),
   12646             binop(Iop_And64,
   12647                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12648                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12649 
   12650       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12651 
   12652       putXMMReg(
   12653          gregOfRM(modrm),
   12654          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12655       );
   12656       goto decode_success;
   12657    }
   12658 
   12659    /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
   12660    /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
   12661    if ((sz == 2 || sz == 4)
   12662        && insn[0] == 0x0F && insn[1] == 0x38
   12663        && (insn[2] == 0xF0 || insn[2] == 0xF1)
   12664        && !epartIsReg(insn[3])) {
   12665 
   12666       modrm = insn[3];
   12667       addr = disAMode(&alen, sorb, delta + 3, dis_buf);
   12668       delta += 3 + alen;
   12669       ty = szToITy(sz);
   12670       IRTemp src = newTemp(ty);
   12671 
   12672       if (insn[2] == 0xF0) { /* LOAD */
   12673          assign(src, loadLE(ty, mkexpr(addr)));
   12674          IRTemp dst = math_BSWAP(src, ty);
   12675          putIReg(sz, gregOfRM(modrm), mkexpr(dst));
   12676          DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
   12677       } else { /* STORE */
   12678          assign(src, getIReg(sz, gregOfRM(modrm)));
   12679          IRTemp dst = math_BSWAP(src, ty);
   12680          storeLE(mkexpr(addr), mkexpr(dst));
   12681          DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
   12682       }
   12683       goto decode_success;
   12684    }
   12685 
   12686    /* ---------------------------------------------------- */
   12687    /* --- end of the SSSE3 decoder.                    --- */
   12688    /* ---------------------------------------------------- */
   12689 
   12690    /* ---------------------------------------------------- */
   12691    /* --- start of the SSE4 decoder                    --- */
   12692    /* ---------------------------------------------------- */
   12693 
   12694    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12695       (Partial implementation only -- only deal with cases where
   12696       the rounding mode is specified directly by the immediate byte.)
   12697       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12698       (Limitations ditto)
   12699    */
   12700    if (sz == 2
   12701        && insn[0] == 0x0F && insn[1] == 0x3A
   12702        && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
   12703 
   12704       Bool   isD = insn[2] == 0x0B;
   12705       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12706       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12707       Int    imm = 0;
   12708 
   12709       modrm = insn[3];
   12710 
   12711       if (epartIsReg(modrm)) {
   12712          assign( src,
   12713                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12714                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12715          imm = insn[3+1];
   12716          if (imm & ~3) goto decode_failure;
   12717          delta += 3+1+1;
   12718          DIP( "rounds%c $%d,%s,%s\n",
   12719               isD ? 'd' : 's',
   12720               imm, nameXMMReg( eregOfRM(modrm) ),
   12721                    nameXMMReg( gregOfRM(modrm) ) );
   12722       } else {
   12723          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12724          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12725          imm = insn[3+alen];
   12726          if (imm & ~3) goto decode_failure;
   12727          delta += 3+alen+1;
   12728          DIP( "roundsd $%d,%s,%s\n",
   12729               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12730       }
   12731 
   12732       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12733          that encoding is the same as the encoding for IRRoundingMode,
   12734          we can use that value directly in the IR as a rounding
   12735          mode. */
   12736       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12737                   mkU32(imm & 3), mkexpr(src)) );
   12738 
   12739       if (isD)
   12740          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12741       else
   12742          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12743 
   12744       goto decode_success;
   12745    }
   12746 
   12747    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12748       which we can only decode if we're sure this is an AMD cpu that
   12749       supports LZCNT, since otherwise it's BSR, which behaves
   12750       differently. */
   12751    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12752        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12753       vassert(sz == 2 || sz == 4);
   12754       /*IRType*/ ty  = szToITy(sz);
   12755       IRTemp     src = newTemp(ty);
   12756       modrm = insn[3];
   12757       if (epartIsReg(modrm)) {
   12758          assign(src, getIReg(sz, eregOfRM(modrm)));
   12759          delta += 3+1;
   12760          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   12761              nameIReg(sz, eregOfRM(modrm)),
   12762              nameIReg(sz, gregOfRM(modrm)));
   12763       } else {
   12764          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12765          assign(src, loadLE(ty, mkexpr(addr)));
   12766          delta += 3+alen;
   12767          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   12768              nameIReg(sz, gregOfRM(modrm)));
   12769       }
   12770 
   12771       IRTemp res = gen_LZCNT(ty, src);
   12772       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   12773 
   12774       // Update flags.  This is pretty lame .. perhaps can do better
   12775       // if this turns out to be performance critical.
   12776       // O S A P are cleared.  Z is set if RESULT == 0.
   12777       // C is set if SRC is zero.
   12778       IRTemp src32 = newTemp(Ity_I32);
   12779       IRTemp res32 = newTemp(Ity_I32);
   12780       assign(src32, widenUto32(mkexpr(src)));
   12781       assign(res32, widenUto32(mkexpr(res)));
   12782 
   12783       IRTemp oszacp = newTemp(Ity_I32);
   12784       assign(
   12785          oszacp,
   12786          binop(Iop_Or32,
   12787                binop(Iop_Shl32,
   12788                      unop(Iop_1Uto32,
   12789                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   12790                      mkU8(X86G_CC_SHIFT_Z)),
   12791                binop(Iop_Shl32,
   12792                      unop(Iop_1Uto32,
   12793                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   12794                      mkU8(X86G_CC_SHIFT_C))
   12795          )
   12796       );
   12797 
   12798       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12799       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12800       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12801       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   12802 
   12803       goto decode_success;
   12804    }
   12805 
   12806    /* ---------------------------------------------------- */
   12807    /* --- end of the SSE4 decoder                      --- */
   12808    /* ---------------------------------------------------- */
   12809 
   12810    after_sse_decoders:
   12811 
   12812    /* ---------------------------------------------------- */
   12813    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   12814    /* ---------------------------------------------------- */
   12815 
   12816    /* 67 E3 = JCXZ (for JECXZ see below) */
   12817    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   12818       delta += 2;
   12819       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   12820       delta ++;
   12821       stmt( IRStmt_Exit(
   12822                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   12823                Ijk_Boring,
   12824                IRConst_U32(d32),
   12825                OFFB_EIP
   12826             ));
   12827        DIP("jcxz 0x%x\n", d32);
   12828        goto decode_success;
   12829    }
   12830 
   12831    /* ---------------------------------------------------- */
   12832    /* --- start of the baseline insn decoder            -- */
   12833    /* ---------------------------------------------------- */
   12834 
   12835    /* Get the primary opcode. */
   12836    opc = getIByte(delta); delta++;
   12837 
   12838    /* We get here if the current insn isn't SSE, or this CPU doesn't
   12839       support SSE. */
   12840 
   12841    switch (opc) {
   12842 
   12843    /* ------------------------ Control flow --------------- */
   12844 
   12845    case 0xC2: /* RET imm16 */
   12846       d32 = getUDisp16(delta);
   12847       delta += 2;
   12848       dis_ret(&dres, d32);
   12849       DIP("ret %d\n", (Int)d32);
   12850       break;
   12851    case 0xC3: /* RET */
   12852       dis_ret(&dres, 0);
   12853       DIP("ret\n");
   12854       break;
   12855 
   12856    case 0xCF: /* IRET */
   12857       /* Note, this is an extremely kludgey and limited implementation
   12858          of iret.  All it really does is:
   12859             popl %EIP; popl %CS; popl %EFLAGS.
   12860          %CS is set but ignored (as it is in (eg) popw %cs). */
   12861       t1 = newTemp(Ity_I32); /* ESP */
   12862       t2 = newTemp(Ity_I32); /* new EIP */
   12863       t3 = newTemp(Ity_I32); /* new CS */
   12864       t4 = newTemp(Ity_I32); /* new EFLAGS */
   12865       assign(t1, getIReg(4,R_ESP));
   12866       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   12867       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   12868       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   12869       /* Get stuff off stack */
   12870       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   12871       /* set %CS (which is ignored anyway) */
   12872       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   12873       /* set %EFLAGS */
   12874       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   12875       /* goto new EIP value */
   12876       jmp_treg(&dres, Ijk_Ret, t2);
   12877       vassert(dres.whatNext == Dis_StopHere);
   12878       DIP("iret (very kludgey)\n");
   12879       break;
   12880 
   12881    case 0xE8: /* CALL J4 */
   12882       d32 = getUDisp32(delta); delta += 4;
   12883       d32 += (guest_EIP_bbstart+delta);
   12884       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   12885       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   12886                                          && getIByte(delta) <= 0x5F) {
   12887          /* Specially treat the position-independent-code idiom
   12888                  call X
   12889               X: popl %reg
   12890             as
   12891                  movl %eip, %reg.
   12892             since this generates better code, but for no other reason. */
   12893          Int archReg = getIByte(delta) - 0x58;
   12894          /* vex_printf("-- fPIC thingy\n"); */
   12895          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   12896          delta++; /* Step over the POP */
   12897          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   12898       } else {
   12899          /* The normal sequence for a call. */
   12900          t1 = newTemp(Ity_I32);
   12901          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   12902          putIReg(4, R_ESP, mkexpr(t1));
   12903          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   12904          if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
   12905             /* follow into the call target. */
   12906             dres.whatNext   = Dis_ResteerU;
   12907             dres.continueAt = (Addr64)(Addr32)d32;
   12908          } else {
   12909             jmp_lit(&dres, Ijk_Call, d32);
   12910             vassert(dres.whatNext == Dis_StopHere);
   12911          }
   12912          DIP("call 0x%x\n",d32);
   12913       }
   12914       break;
   12915 
   12916 //--    case 0xC8: /* ENTER */
   12917 //--       d32 = getUDisp16(eip); eip += 2;
   12918 //--       abyte = getIByte(delta); delta++;
   12919 //--
   12920 //--       vg_assert(sz == 4);
   12921 //--       vg_assert(abyte == 0);
   12922 //--
   12923 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   12924 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   12925 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   12926 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12927 //--       uLiteral(cb, sz);
   12928 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12929 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   12930 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   12931 //--       if (d32) {
   12932 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12933 //--          uLiteral(cb, d32);
   12934 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12935 //--       }
   12936 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   12937 //--       break;
   12938 
   12939    case 0xC9: /* LEAVE */
   12940       vassert(sz == 4);
   12941       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   12942       assign(t1, getIReg(4,R_EBP));
   12943       /* First PUT ESP looks redundant, but need it because ESP must
   12944          always be up-to-date for Memcheck to work... */
   12945       putIReg(4, R_ESP, mkexpr(t1));
   12946       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   12947       putIReg(4, R_EBP, mkexpr(t2));
   12948       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   12949       DIP("leave\n");
   12950       break;
   12951 
   12952    /* ---------------- Misc weird-ass insns --------------- */
   12953 
   12954    case 0x27: /* DAA */
   12955    case 0x2F: /* DAS */
   12956    case 0x37: /* AAA */
   12957    case 0x3F: /* AAS */
   12958       /* An ugly implementation for some ugly instructions.  Oh
   12959 	 well. */
   12960       if (sz != 4) goto decode_failure;
   12961       t1 = newTemp(Ity_I32);
   12962       t2 = newTemp(Ity_I32);
   12963       /* Make up a 32-bit value (t1), with the old value of AX in the
   12964          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   12965          bits. */
   12966       assign(t1,
   12967              binop(Iop_16HLto32,
   12968                    unop(Iop_32to16,
   12969                         mk_x86g_calculate_eflags_all()),
   12970                    getIReg(2, R_EAX)
   12971             ));
   12972       /* Call the helper fn, to get a new AX and OSZACP value, and
   12973          poke both back into the guest state.  Also pass the helper
   12974          the actual opcode so it knows which of the 4 instructions it
   12975          is doing the computation for. */
   12976       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   12977       assign(t2,
   12978               mkIRExprCCall(
   12979                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   12980                  &x86g_calculate_daa_das_aaa_aas,
   12981                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   12982             ));
   12983      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   12984 
   12985      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12986      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12987      stmt( IRStmt_Put( OFFB_CC_DEP1,
   12988                        binop(Iop_And32,
   12989                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   12990                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   12991                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   12992                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   12993                             )
   12994                       )
   12995          );
   12996      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   12997         elimination of previous stores to this field work better. */
   12998      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12999      switch (opc) {
   13000         case 0x27: DIP("daa\n"); break;
   13001         case 0x2F: DIP("das\n"); break;
   13002         case 0x37: DIP("aaa\n"); break;
   13003         case 0x3F: DIP("aas\n"); break;
   13004         default: vassert(0);
   13005      }
   13006      break;
   13007 
   13008    case 0xD4: /* AAM */
   13009    case 0xD5: /* AAD */
   13010       d32 = getIByte(delta); delta++;
   13011       if (sz != 4 || d32 != 10) goto decode_failure;
   13012       t1 = newTemp(Ity_I32);
   13013       t2 = newTemp(Ity_I32);
   13014       /* Make up a 32-bit value (t1), with the old value of AX in the
   13015          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   13016          bits. */
   13017       assign(t1,
   13018              binop(Iop_16HLto32,
   13019                    unop(Iop_32to16,
   13020                         mk_x86g_calculate_eflags_all()),
   13021                    getIReg(2, R_EAX)
   13022             ));
   13023       /* Call the helper fn, to get a new AX and OSZACP value, and
   13024          poke both back into the guest state.  Also pass the helper
   13025          the actual opcode so it knows which of the 2 instructions it
   13026          is doing the computation for. */
   13027       assign(t2,
   13028               mkIRExprCCall(
   13029                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   13030                  &x86g_calculate_aad_aam,
   13031                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   13032             ));
   13033       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   13034 
   13035       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13036       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13037       stmt( IRStmt_Put( OFFB_CC_DEP1,
   13038                         binop(Iop_And32,
   13039                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   13040                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   13041                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   13042                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   13043                              )
   13044                        )
   13045           );
   13046       /* Set NDEP even though it isn't used.  This makes
   13047          redundant-PUT elimination of previous stores to this field
   13048          work better. */
   13049       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13050 
   13051       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   13052       break;
   13053 
   13054    /* ------------------------ CWD/CDQ -------------------- */
   13055 
   13056    case 0x98: /* CBW */
   13057       if (sz == 4) {
   13058          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   13059          DIP("cwde\n");
   13060       } else {
   13061          vassert(sz == 2);
   13062          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   13063          DIP("cbw\n");
   13064       }
   13065       break;
   13066 
   13067    case 0x99: /* CWD/CDQ */
   13068       ty = szToITy(sz);
   13069       putIReg(sz, R_EDX,
   13070                   binop(mkSizedOp(ty,Iop_Sar8),
   13071                         getIReg(sz, R_EAX),
   13072                         mkU8(sz == 2 ? 15 : 31)) );
   13073       DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
   13074       break;
   13075 
   13076    /* ------------------------ FPU ops -------------------- */
   13077 
   13078    case 0x9E: /* SAHF */
   13079       codegen_SAHF();
   13080       DIP("sahf\n");
   13081       break;
   13082 
   13083    case 0x9F: /* LAHF */
   13084       codegen_LAHF();
   13085       DIP("lahf\n");
   13086       break;
   13087 
   13088    case 0x9B: /* FWAIT */
   13089       /* ignore? */
   13090       DIP("fwait\n");
   13091       break;
   13092 
   13093    case 0xD8:
   13094    case 0xD9:
   13095    case 0xDA:
   13096    case 0xDB:
   13097    case 0xDC:
   13098    case 0xDD:
   13099    case 0xDE:
   13100    case 0xDF: {
   13101       Int  delta0    = delta;
   13102       Bool decode_OK = False;
   13103       delta = dis_FPU ( &decode_OK, sorb, delta );
   13104       if (!decode_OK) {
   13105          delta = delta0;
   13106          goto decode_failure;
   13107       }
   13108       break;
   13109    }
   13110 
   13111    /* ------------------------ INC & DEC ------------------ */
   13112 
   13113    case 0x40: /* INC eAX */
   13114    case 0x41: /* INC eCX */
   13115    case 0x42: /* INC eDX */
   13116    case 0x43: /* INC eBX */
   13117    case 0x44: /* INC eSP */
   13118    case 0x45: /* INC eBP */
   13119    case 0x46: /* INC eSI */
   13120    case 0x47: /* INC eDI */
   13121       vassert(sz == 2 || sz == 4);
   13122       ty = szToITy(sz);
   13123       t1 = newTemp(ty);
   13124       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13125                         getIReg(sz, (UInt)(opc - 0x40)),
   13126                         mkU(ty,1)) );
   13127       setFlags_INC_DEC( True, t1, ty );
   13128       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13129       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13130       break;
   13131 
   13132    case 0x48: /* DEC eAX */
   13133    case 0x49: /* DEC eCX */
   13134    case 0x4A: /* DEC eDX */
   13135    case 0x4B: /* DEC eBX */
   13136    case 0x4C: /* DEC eSP */
   13137    case 0x4D: /* DEC eBP */
   13138    case 0x4E: /* DEC eSI */
   13139    case 0x4F: /* DEC eDI */
   13140       vassert(sz == 2 || sz == 4);
   13141       ty = szToITy(sz);
   13142       t1 = newTemp(ty);
   13143       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13144                         getIReg(sz, (UInt)(opc - 0x48)),
   13145                         mkU(ty,1)) );
   13146       setFlags_INC_DEC( False, t1, ty );
   13147       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13148       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13149       break;
   13150 
   13151    /* ------------------------ INT ------------------------ */
   13152 
   13153    case 0xCC: /* INT 3 */
   13154       jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
   13155       vassert(dres.whatNext == Dis_StopHere);
   13156       DIP("int $0x3\n");
   13157       break;
   13158 
   13159    case 0xCD: /* INT imm8 */
   13160       d32 = getIByte(delta); delta++;
   13161 
   13162       /* For any of the cases where we emit a jump (that is, for all
   13163          currently handled cases), it's important that all ArchRegs
   13164          carry their up-to-date value at this point.  So we declare an
   13165          end-of-block here, which forces any TempRegs caching ArchRegs
   13166          to be flushed. */
   13167 
   13168       /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
   13169          restart of this instruction (hence the "-2" two lines below,
   13170          to get the restart EIP to be this instruction).  This is
   13171          probably Linux-specific and it would be more correct to only
   13172          do this if the VexAbiInfo says that is what we should do.
   13173          This used to handle just 0x40-0x43; Jikes RVM uses a larger
   13174          range (0x3F-0x49), and this allows some slack as well. */
   13175       if (d32 >= 0x3F && d32 <= 0x4F) {
   13176          jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
   13177          vassert(dres.whatNext == Dis_StopHere);
   13178          DIP("int $0x%x\n", (Int)d32);
   13179          break;
   13180       }
   13181 
   13182       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13183          (darwin syscalls).  As part of this, note where we are, so we
   13184          can back up the guest to this point if the syscall needs to
   13185          be restarted. */
   13186       if (d32 == 0x80) {
   13187          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13188                            mkU32(guest_EIP_curr_instr) ) );
   13189          jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
   13190          vassert(dres.whatNext == Dis_StopHere);
   13191          DIP("int $0x80\n");
   13192          break;
   13193       }
   13194       if (d32 == 0x81) {
   13195          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13196                            mkU32(guest_EIP_curr_instr) ) );
   13197          jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
   13198          vassert(dres.whatNext == Dis_StopHere);
   13199          DIP("int $0x81\n");
   13200          break;
   13201       }
   13202       if (d32 == 0x82) {
   13203          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13204                            mkU32(guest_EIP_curr_instr) ) );
   13205          jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
   13206          vassert(dres.whatNext == Dis_StopHere);
   13207          DIP("int $0x82\n");
   13208          break;
   13209       }
   13210 
   13211       /* none of the above */
   13212       goto decode_failure;
   13213 
   13214    /* ------------------------ Jcond, byte offset --------- */
   13215 
   13216    case 0xEB: /* Jb (jump, byte offset) */
   13217       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13218       delta++;
   13219       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13220          dres.whatNext   = Dis_ResteerU;
   13221          dres.continueAt = (Addr64)(Addr32)d32;
   13222       } else {
   13223          jmp_lit(&dres, Ijk_Boring, d32);
   13224          vassert(dres.whatNext == Dis_StopHere);
   13225       }
   13226       DIP("jmp-8 0x%x\n", d32);
   13227       break;
   13228 
   13229    case 0xE9: /* Jv (jump, 16/32 offset) */
   13230       vassert(sz == 4); /* JRS added 2004 July 11 */
   13231       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13232       delta += sz;
   13233       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13234          dres.whatNext   = Dis_ResteerU;
   13235          dres.continueAt = (Addr64)(Addr32)d32;
   13236       } else {
   13237          jmp_lit(&dres, Ijk_Boring, d32);
   13238          vassert(dres.whatNext == Dis_StopHere);
   13239       }
   13240       DIP("jmp 0x%x\n", d32);
   13241       break;
   13242 
   13243    case 0x70:
   13244    case 0x71:
   13245    case 0x72: /* JBb/JNAEb (jump below) */
   13246    case 0x73: /* JNBb/JAEb (jump not below) */
   13247    case 0x74: /* JZb/JEb (jump zero) */
   13248    case 0x75: /* JNZb/JNEb (jump not zero) */
   13249    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13250    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13251    case 0x78: /* JSb (jump negative) */
   13252    case 0x79: /* JNSb (jump not negative) */
   13253    case 0x7A: /* JP (jump parity even) */
   13254    case 0x7B: /* JNP/JPO (jump parity odd) */
   13255    case 0x7C: /* JLb/JNGEb (jump less) */
   13256    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13257    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13258    case 0x7F: /* JGb/JNLEb (jump greater) */
   13259     { Int    jmpDelta;
   13260       HChar* comment  = "";
   13261       jmpDelta = (Int)getSDisp8(delta);
   13262       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13263       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
   13264       delta++;
   13265       if (resteerCisOk
   13266           && vex_control.guest_chase_cond
   13267           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13268           && jmpDelta < 0
   13269           && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13270          /* Speculation: assume this backward branch is taken.  So we
   13271             need to emit a side-exit to the insn following this one,
   13272             on the negation of the condition, and continue at the
   13273             branch target address (d32).  If we wind up back at the
   13274             first instruction of the trace, just stop; it's better to
   13275             let the IR loop unroller handle that case. */
   13276          stmt( IRStmt_Exit(
   13277                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13278                   Ijk_Boring,
   13279                   IRConst_U32(guest_EIP_bbstart+delta),
   13280                   OFFB_EIP ) );
   13281          dres.whatNext   = Dis_ResteerC;
   13282          dres.continueAt = (Addr64)(Addr32)d32;
   13283          comment = "(assumed taken)";
   13284       }
   13285       else
   13286       if (resteerCisOk
   13287           && vex_control.guest_chase_cond
   13288           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13289           && jmpDelta >= 0
   13290           && resteerOkFn( callback_opaque,
   13291                           (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   13292          /* Speculation: assume this forward branch is not taken.  So
   13293             we need to emit a side-exit to d32 (the dest) and continue
   13294             disassembling at the insn immediately following this
   13295             one. */
   13296          stmt( IRStmt_Exit(
   13297                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13298                   Ijk_Boring,
   13299                   IRConst_U32(d32),
   13300                   OFFB_EIP ) );
   13301          dres.whatNext   = Dis_ResteerC;
   13302          dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   13303          comment = "(assumed not taken)";
   13304       }
   13305       else {
   13306          /* Conservative default translation - end the block at this
   13307             point. */
   13308          jcc_01( &dres, (X86Condcode)(opc - 0x70),
   13309                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13310          vassert(dres.whatNext == Dis_StopHere);
   13311       }
   13312       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13313       break;
   13314     }
   13315 
   13316    case 0xE3: /* JECXZ (for JCXZ see above) */
   13317       if (sz != 4) goto decode_failure;
   13318       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13319       delta ++;
   13320       stmt( IRStmt_Exit(
   13321                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13322             Ijk_Boring,
   13323             IRConst_U32(d32),
   13324             OFFB_EIP
   13325           ));
   13326       DIP("jecxz 0x%x\n", d32);
   13327       break;
   13328 
   13329    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13330    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13331    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13332     { /* Again, the docs say this uses ECX/CX as a count depending on
   13333          the address size override, not the operand one.  Since we
   13334          don't handle address size overrides, I guess that means
   13335          ECX. */
   13336       IRExpr* zbit  = NULL;
   13337       IRExpr* count = NULL;
   13338       IRExpr* cond  = NULL;
   13339       HChar*  xtra  = NULL;
   13340 
   13341       if (sz != 4) goto decode_failure;
   13342       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13343       delta++;
   13344       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13345 
   13346       count = getIReg(4,R_ECX);
   13347       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13348       switch (opc) {
   13349          case 0xE2:
   13350             xtra = "";
   13351             break;
   13352          case 0xE1:
   13353             xtra = "e";
   13354             zbit = mk_x86g_calculate_condition( X86CondZ );
   13355 	    cond = mkAnd1(cond, zbit);
   13356             break;
   13357          case 0xE0:
   13358             xtra = "ne";
   13359             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13360 	    cond = mkAnd1(cond, zbit);
   13361             break;
   13362          default:
   13363 	    vassert(0);
   13364       }
   13365       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
   13366 
   13367       DIP("loop%s 0x%x\n", xtra, d32);
   13368       break;
   13369     }
   13370 
   13371    /* ------------------------ IMUL ----------------------- */
   13372 
   13373    case 0x69: /* IMUL Iv, Ev, Gv */
   13374       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13375       break;
   13376    case 0x6B: /* IMUL Ib, Ev, Gv */
   13377       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13378       break;
   13379 
   13380    /* ------------------------ MOV ------------------------ */
   13381 
   13382    case 0x88: /* MOV Gb,Eb */
   13383       delta = dis_mov_G_E(sorb, 1, delta);
   13384       break;
   13385 
   13386    case 0x89: /* MOV Gv,Ev */
   13387       delta = dis_mov_G_E(sorb, sz, delta);
   13388       break;
   13389 
   13390    case 0x8A: /* MOV Eb,Gb */
   13391       delta = dis_mov_E_G(sorb, 1, delta);
   13392       break;
   13393 
   13394    case 0x8B: /* MOV Ev,Gv */
   13395       delta = dis_mov_E_G(sorb, sz, delta);
   13396       break;
   13397 
   13398    case 0x8D: /* LEA M,Gv */
   13399       if (sz != 4)
   13400          goto decode_failure;
   13401       modrm = getIByte(delta);
   13402       if (epartIsReg(modrm))
   13403          goto decode_failure;
   13404       /* NOTE!  this is the one place where a segment override prefix
   13405          has no effect on the address calculation.  Therefore we pass
   13406          zero instead of sorb here. */
   13407       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13408       delta += alen;
   13409       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13410       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13411                             nameIReg(sz,gregOfRM(modrm)));
   13412       break;
   13413 
   13414    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13415       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13416       break;
   13417 
   13418    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13419       delta = dis_mov_Ew_Sw(sorb, delta);
   13420       break;
   13421 
   13422    case 0xA0: /* MOV Ob,AL */
   13423       sz = 1;
   13424       /* Fall through ... */
   13425    case 0xA1: /* MOV Ov,eAX */
   13426       d32 = getUDisp32(delta); delta += 4;
   13427       ty = szToITy(sz);
   13428       addr = newTemp(Ity_I32);
   13429       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13430       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13431       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13432                                 d32, nameIReg(sz,R_EAX));
   13433       break;
   13434 
   13435    case 0xA2: /* MOV AL,Ob */
   13436       sz = 1;
   13437       /* Fall through ... */
   13438    case 0xA3: /* MOV eAX,Ov */
   13439       d32 = getUDisp32(delta); delta += 4;
   13440       ty = szToITy(sz);
   13441       addr = newTemp(Ity_I32);
   13442       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13443       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13444       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13445                                 sorbTxt(sorb), d32);
   13446       break;
   13447 
   13448    case 0xB0: /* MOV imm,AL */
   13449    case 0xB1: /* MOV imm,CL */
   13450    case 0xB2: /* MOV imm,DL */
   13451    case 0xB3: /* MOV imm,BL */
   13452    case 0xB4: /* MOV imm,AH */
   13453    case 0xB5: /* MOV imm,CH */
   13454    case 0xB6: /* MOV imm,DH */
   13455    case 0xB7: /* MOV imm,BH */
   13456       d32 = getIByte(delta); delta += 1;
   13457       putIReg(1, opc-0xB0, mkU8(d32));
   13458       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13459       break;
   13460 
   13461    case 0xB8: /* MOV imm,eAX */
   13462    case 0xB9: /* MOV imm,eCX */
   13463    case 0xBA: /* MOV imm,eDX */
   13464    case 0xBB: /* MOV imm,eBX */
   13465    case 0xBC: /* MOV imm,eSP */
   13466    case 0xBD: /* MOV imm,eBP */
   13467    case 0xBE: /* MOV imm,eSI */
   13468    case 0xBF: /* MOV imm,eDI */
   13469       d32 = getUDisp(sz,delta); delta += sz;
   13470       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13471       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13472       break;
   13473 
   13474    case 0xC6: /* MOV Ib,Eb */
   13475       sz = 1;
   13476       goto do_Mov_I_E;
   13477    case 0xC7: /* MOV Iv,Ev */
   13478       goto do_Mov_I_E;
   13479 
   13480    do_Mov_I_E:
   13481       modrm = getIByte(delta);
   13482       if (epartIsReg(modrm)) {
   13483          delta++; /* mod/rm byte */
   13484          d32 = getUDisp(sz,delta); delta += sz;
   13485          putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13486          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13487                                   nameIReg(sz,eregOfRM(modrm)));
   13488       } else {
   13489          addr = disAMode ( &alen, sorb, delta, dis_buf );
   13490          delta += alen;
   13491          d32 = getUDisp(sz,delta); delta += sz;
   13492          storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13493          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13494       }
   13495       break;
   13496 
   13497    /* ------------------------ opl imm, A ----------------- */
   13498 
   13499    case 0x04: /* ADD Ib, AL */
   13500       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13501       break;
   13502    case 0x05: /* ADD Iv, eAX */
   13503       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13504       break;
   13505 
   13506    case 0x0C: /* OR Ib, AL */
   13507       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13508       break;
   13509    case 0x0D: /* OR Iv, eAX */
   13510       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13511       break;
   13512 
   13513    case 0x14: /* ADC Ib, AL */
   13514       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13515       break;
   13516    case 0x15: /* ADC Iv, eAX */
   13517       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13518       break;
   13519 
   13520    case 0x1C: /* SBB Ib, AL */
   13521       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13522       break;
   13523    case 0x1D: /* SBB Iv, eAX */
   13524       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13525       break;
   13526 
   13527    case 0x24: /* AND Ib, AL */
   13528       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13529       break;
   13530    case 0x25: /* AND Iv, eAX */
   13531       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13532       break;
   13533 
   13534    case 0x2C: /* SUB Ib, AL */
   13535       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13536       break;
   13537    case 0x2D: /* SUB Iv, eAX */
   13538       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13539       break;
   13540 
   13541    case 0x34: /* XOR Ib, AL */
   13542       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13543       break;
   13544    case 0x35: /* XOR Iv, eAX */
   13545       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13546       break;
   13547 
   13548    case 0x3C: /* CMP Ib, AL */
   13549       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13550       break;
   13551    case 0x3D: /* CMP Iv, eAX */
   13552       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13553       break;
   13554 
   13555    case 0xA8: /* TEST Ib, AL */
   13556       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13557       break;
   13558    case 0xA9: /* TEST Iv, eAX */
   13559       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13560       break;
   13561 
   13562    /* ------------------------ opl Ev, Gv ----------------- */
   13563 
   13564    case 0x02: /* ADD Eb,Gb */
   13565       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13566       break;
   13567    case 0x03: /* ADD Ev,Gv */
   13568       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13569       break;
   13570 
   13571    case 0x0A: /* OR Eb,Gb */
   13572       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13573       break;
   13574    case 0x0B: /* OR Ev,Gv */
   13575       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13576       break;
   13577 
   13578    case 0x12: /* ADC Eb,Gb */
   13579       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13580       break;
   13581    case 0x13: /* ADC Ev,Gv */
   13582       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13583       break;
   13584 
   13585    case 0x1A: /* SBB Eb,Gb */
   13586       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13587       break;
   13588    case 0x1B: /* SBB Ev,Gv */
   13589       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13590       break;
   13591 
   13592    case 0x22: /* AND Eb,Gb */
   13593       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13594       break;
   13595    case 0x23: /* AND Ev,Gv */
   13596       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13597       break;
   13598 
   13599    case 0x2A: /* SUB Eb,Gb */
   13600       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13601       break;
   13602    case 0x2B: /* SUB Ev,Gv */
   13603       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13604       break;
   13605 
   13606    case 0x32: /* XOR Eb,Gb */
   13607       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13608       break;
   13609    case 0x33: /* XOR Ev,Gv */
   13610       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13611       break;
   13612 
   13613    case 0x3A: /* CMP Eb,Gb */
   13614       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13615       break;
   13616    case 0x3B: /* CMP Ev,Gv */
   13617       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13618       break;
   13619 
   13620    case 0x84: /* TEST Eb,Gb */
   13621       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13622       break;
   13623    case 0x85: /* TEST Ev,Gv */
   13624       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13625       break;
   13626 
   13627    /* ------------------------ opl Gv, Ev ----------------- */
   13628 
   13629    case 0x00: /* ADD Gb,Eb */
   13630       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13631                             Iop_Add8, True, 1, delta, "add" );
   13632       break;
   13633    case 0x01: /* ADD Gv,Ev */
   13634       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13635                             Iop_Add8, True, sz, delta, "add" );
   13636       break;
   13637 
   13638    case 0x08: /* OR Gb,Eb */
   13639       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13640                             Iop_Or8, True, 1, delta, "or" );
   13641       break;
   13642    case 0x09: /* OR Gv,Ev */
   13643       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13644                             Iop_Or8, True, sz, delta, "or" );
   13645       break;
   13646 
   13647    case 0x10: /* ADC Gb,Eb */
   13648       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13649                             Iop_Add8, True, 1, delta, "adc" );
   13650       break;
   13651    case 0x11: /* ADC Gv,Ev */
   13652       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13653                             Iop_Add8, True, sz, delta, "adc" );
   13654       break;
   13655 
   13656    case 0x18: /* SBB Gb,Eb */
   13657       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13658                             Iop_Sub8, True, 1, delta, "sbb" );
   13659       break;
   13660    case 0x19: /* SBB Gv,Ev */
   13661       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13662                             Iop_Sub8, True, sz, delta, "sbb" );
   13663       break;
   13664 
   13665    case 0x20: /* AND Gb,Eb */
   13666       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13667                             Iop_And8, True, 1, delta, "and" );
   13668       break;
   13669    case 0x21: /* AND Gv,Ev */
   13670       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13671                             Iop_And8, True, sz, delta, "and" );
   13672       break;
   13673 
   13674    case 0x28: /* SUB Gb,Eb */
   13675       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13676                             Iop_Sub8, True, 1, delta, "sub" );
   13677       break;
   13678    case 0x29: /* SUB Gv,Ev */
   13679       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13680                             Iop_Sub8, True, sz, delta, "sub" );
   13681       break;
   13682 
   13683    case 0x30: /* XOR Gb,Eb */
   13684       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13685                             Iop_Xor8, True, 1, delta, "xor" );
   13686       break;
   13687    case 0x31: /* XOR Gv,Ev */
   13688       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13689                             Iop_Xor8, True, sz, delta, "xor" );
   13690       break;
   13691 
   13692    case 0x38: /* CMP Gb,Eb */
   13693       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13694                             Iop_Sub8, False, 1, delta, "cmp" );
   13695       break;
   13696    case 0x39: /* CMP Gv,Ev */
   13697       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13698                             Iop_Sub8, False, sz, delta, "cmp" );
   13699       break;
   13700 
   13701    /* ------------------------ POP ------------------------ */
   13702 
   13703    case 0x58: /* POP eAX */
   13704    case 0x59: /* POP eCX */
   13705    case 0x5A: /* POP eDX */
   13706    case 0x5B: /* POP eBX */
   13707    case 0x5D: /* POP eBP */
   13708    case 0x5E: /* POP eSI */
   13709    case 0x5F: /* POP eDI */
   13710    case 0x5C: /* POP eSP */
   13711       vassert(sz == 2 || sz == 4);
   13712       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13713       assign(t2, getIReg(4, R_ESP));
   13714       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13715       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13716       putIReg(sz, opc-0x58, mkexpr(t1));
   13717       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13718       break;
   13719 
   13720    case 0x9D: /* POPF */
   13721       vassert(sz == 2 || sz == 4);
   13722       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13723       assign(t2, getIReg(4, R_ESP));
   13724       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13725       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13726 
   13727       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13728 	 value in t1. */
   13729       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13730                                  ((Addr32)guest_EIP_bbstart)+delta );
   13731 
   13732       DIP("popf%c\n", nameISize(sz));
   13733       break;
   13734 
   13735    case 0x61: /* POPA */
   13736       /* The code below is almost certainly wrong for sz==2, so only the 32-bit form is accepted. */
   13737       if (sz != 4) goto decode_failure;
   13738 
   13739       /* t5 is the old %ESP value. */
   13740       t5 = newTemp(Ity_I32);
   13741       assign( t5, getIReg(4, R_ESP) );
   13742 
   13743       /* Reload all the registers, except %esp, from offsets 28 down to 0 above the old %ESP. */
   13744       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13745       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13746       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13747       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13748       /* the slot at offset 12 holds the saved %ESP; it is deliberately not reloaded */
   13749       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13750       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13751       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13752 
   13753       /* and move %ESP back up past all eight 4-byte slots */
   13754       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13755 
   13756       DIP("popa%c\n", nameISize(sz));
   13757       break;
   13758 
   13759    case 0x8F: /* POPL/POPW Ev (memory operand only) */
   13760      { Int    len;
   13761        UChar  rm = getIByte(delta);
   13762 
   13763        /* only the memory form with reg field 0 is decoded as POP here */
   13764        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   13765           goto decode_failure;
   13766        /* and only operand sizes 4 and 2 are accepted */
   13767        if (sz != 4 && sz != 2)
   13768           goto decode_failure;
   13769        ty = szToITy(sz);
   13770 
   13771        t1 = newTemp(Ity_I32); /* stack address */
   13772        t3 = newTemp(ty); /* data */
   13773        /* set t1 to ESP: t1 = ESP */
   13774        assign( t1, getIReg(4, R_ESP) );
   13775        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   13776        assign( t3, loadLE(ty, mkexpr(t1)) );
   13777 
   13778        /* increase ESP; must be done before the STORE.  Intel manual says:
   13779             If the ESP register is used as a base register for addressing
   13780             a destination operand in memory, the POP instruction computes
   13781             the effective address of the operand after it increments the
   13782             ESP register.
   13783        */
   13784        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   13785 
   13786        /* resolve MODR/M only now, after the ESP update above, then store the popped data */
   13787        addr = disAMode ( &len, sorb, delta, dis_buf);
   13788        storeLE( mkexpr(addr), mkexpr(t3) );
   13789 
   13790        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   13791 
   13792        delta += len;
   13793        break;
   13794      }
   13795 
   13796    case 0x1F: /* POP %DS */
   13797       dis_pop_segreg( R_DS, sz ); break;
   13798    case 0x07: /* POP %ES */
   13799       dis_pop_segreg( R_ES, sz ); break;
   13800    case 0x17: /* POP %SS */
   13801       dis_pop_segreg( R_SS, sz ); break;
   13802 
   13803    /* ------------------------ PUSH ----------------------- */
   13804 
   13805    case 0x50: /* PUSH eAX */
   13806    case 0x51: /* PUSH eCX */
   13807    case 0x52: /* PUSH eDX */
   13808    case 0x53: /* PUSH eBX */
   13809    case 0x55: /* PUSH eBP */
   13810    case 0x56: /* PUSH eSI */
   13811    case 0x57: /* PUSH eDI */
   13812    case 0x54: /* PUSH eSP */
   13813       /* This is the Right Way, in that the value to be pushed is
   13814          established before %esp is changed, so that pushl %esp
   13815          correctly pushes the old value. */
   13816       vassert(sz == 2 || sz == 4);
   13817       ty = sz==2 ? Ity_I16 : Ity_I32;
   13818       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   13819       assign(t1, getIReg(sz, opc-0x50));
   13820       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   13821       putIReg(4, R_ESP, mkexpr(t2) );
   13822       storeLE(mkexpr(t2),mkexpr(t1));
   13823       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   13824       break;
   13825 
   13826 
   13827    case 0x68: /* PUSH Iv */
   13828       d32 = getUDisp(sz,delta); delta += sz;
   13829       goto do_push_I;
   13830    case 0x6A: /* PUSH Ib, sign-extended to sz */
   13831       d32 = getSDisp8(delta); delta += 1;
   13832       goto do_push_I;
   13833    do_push_I:
   13834       ty = szToITy(sz);
   13835       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   13836       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13837       putIReg(4, R_ESP, mkexpr(t1) );
   13838       /* stop mkU16 asserting if d32 is a negative 16-bit number
   13839          (bug #132813) */
   13840       if (ty == Ity_I16)
   13841          d32 &= 0xFFFF;
   13842       storeLE( mkexpr(t1), mkU(ty,d32) );
   13843       DIP("push%c $0x%x\n", nameISize(sz), d32);
   13844       break;
   13845 
   13846    case 0x9C: /* PUSHF */ {
   13847       vassert(sz == 2 || sz == 4);
   13848 
   13849       t1 = newTemp(Ity_I32);
   13850       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13851       putIReg(4, R_ESP, mkexpr(t1) );
   13852 
   13853       /* Calculate OSZACP, and patch in fixed fields as per
   13854          Intel docs.
   13855          - bit 1 is always 1
   13856          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   13857       */
   13858       t2 = newTemp(Ity_I32);
   13859       assign( t2, binop(Iop_Or32,
   13860                         mk_x86g_calculate_eflags_all(),
   13861                         mkU32( (1<<1)|(1<<9) ) ));
   13862 
   13863       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   13864          baseBlock[OFFB_DFLAG]. */
   13865       t3 = newTemp(Ity_I32);
   13866       assign( t3, binop(Iop_Or32,
   13867                         mkexpr(t2),
   13868                         binop(Iop_And32,
   13869                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   13870                               mkU32(1<<10)))
   13871             );
   13872 
   13873       /* And patch in the ID flag. */
   13874       t4 = newTemp(Ity_I32);
   13875       assign( t4, binop(Iop_Or32,
   13876                         mkexpr(t3),
   13877                         binop(Iop_And32,
   13878                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   13879                                                mkU8(21)),
   13880                               mkU32(1<<21)))
   13881             );
   13882 
   13883       /* And patch in the AC flag. */
   13884       t5 = newTemp(Ity_I32);
   13885       assign( t5, binop(Iop_Or32,
   13886                         mkexpr(t4),
   13887                         binop(Iop_And32,
   13888                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   13889                                                mkU8(18)),
   13890                               mkU32(1<<18)))
   13891             );
   13892 
   13893       /* if sz==2, the stored value needs to be narrowed. */
   13894       if (sz == 2)
   13895         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   13896       else
   13897         storeLE( mkexpr(t1), mkexpr(t5) );
   13898 
   13899       DIP("pushf%c\n", nameISize(sz));
   13900       break;
   13901    }
   13902 
   13903    case 0x60: /* PUSHA */
   13904       /* The code below is almost certainly wrong for sz==2, so only the 32-bit form is accepted. */
   13905       if (sz != 4) goto decode_failure;
   13906 
   13907       /* This is the Right Way, in that the value to be pushed is
   13908          established before %esp is changed, so that pusha
   13909          correctly pushes the old %esp value.  The %esp register
   13910          itself is set to its new value first, before any store. */
   13911       /* t0 is the %ESP value we're going to push. */
   13912       t0 = newTemp(Ity_I32);
   13913       assign( t0, getIReg(4, R_ESP) );
   13914 
   13915       /* t5 will be the new %ESP value (old value minus eight 4-byte slots). */
   13916       t5 = newTemp(Ity_I32);
   13917       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   13918 
   13919       /* Update guest state before prodding memory. */
   13920       putIReg(4, R_ESP, mkexpr(t5));
   13921 
   13922       /* Dump all the registers, at offsets 28 down to 0 above the new %ESP. */
   13923       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   13924       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   13925       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   13926       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   13927       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /* old %esp, captured in t0 above */);
   13928       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   13929       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   13930       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   13931 
   13932       DIP("pusha%c\n", nameISize(sz));
   13933       break;
   13934 
   13935    case 0x0E: /* PUSH %CS */
   13936       dis_push_segreg( R_CS, sz ); break;
   13937    case 0x1E: /* PUSH %DS */
   13938       dis_push_segreg( R_DS, sz ); break;
   13939    case 0x06: /* PUSH %ES */
   13940       dis_push_segreg( R_ES, sz ); break;
   13941    case 0x16: /* PUSH %SS */
   13942       dis_push_segreg( R_SS, sz ); break;
   13943 
   13944    /* ------------------------ SCAS et al ----------------- */
   13945 
   13946    case 0xA4: /* MOVS, no REP prefix */
   13947    case 0xA5:
   13948       if (sorb != 0)
   13949          goto decode_failure; /* else dis_string_op asserts */
   13950       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   13951       break;
   13952 
   13953   case 0xA6: /* CMPS, no REP prefix (0xA6 is the byte form) */
   13954   case 0xA7: /* CMPS at the current operand size sz */
   13955       if (sorb != 0)
   13956          goto decode_failure; /* else dis_string_op asserts */
   13957       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   13958       break;
   13959 
   13960    case 0xAA: /* STOS, no REP prefix */
   13961    case 0xAB:
   13962       if (sorb != 0)
   13963          goto decode_failure; /* else dis_string_op asserts */
   13964       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   13965       break;
   13966 
   13967    case 0xAC: /* LODS, no REP prefix */
   13968    case 0xAD:
   13969       if (sorb != 0)
   13970          goto decode_failure; /* else dis_string_op asserts */
   13971       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   13972       break;
   13973 
   13974    case 0xAE: /* SCAS, no REP prefix */
   13975    case 0xAF:
   13976       if (sorb != 0)
   13977          goto decode_failure; /* else dis_string_op asserts */
   13978       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   13979       break;
   13980 
   13981 
   13982    case 0xFC: /* CLD */
   13983       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   13984       DIP("cld\n");
   13985       break;
   13986 
   13987    case 0xFD: /* STD */
   13988       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   13989       DIP("std\n");
   13990       break;
   13991 
   13992    case 0xF8: /* CLC */
   13993    case 0xF9: /* STC */
   13994    case 0xF5: /* CMC */
   13995       t0 = newTemp(Ity_I32);
   13996       t1 = newTemp(Ity_I32);
   13997       assign( t0, mk_x86g_calculate_eflags_all() );
   13998       switch (opc) {
   13999          case 0xF8:
   14000             assign( t1, binop(Iop_And32, mkexpr(t0),
   14001                                          mkU32(~X86G_CC_MASK_C)));
   14002             DIP("clc\n");
   14003             break;
   14004          case 0xF9:
   14005             assign( t1, binop(Iop_Or32, mkexpr(t0),
   14006                                         mkU32(X86G_CC_MASK_C)));
   14007             DIP("stc\n");
   14008             break;
   14009          case 0xF5:
   14010             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   14011                                          mkU32(X86G_CC_MASK_C)));
   14012             DIP("cmc\n");
   14013             break;
   14014          default:
   14015             vpanic("disInstr(x86)(clc/stc/cmc)");
   14016       }
   14017       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14018       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14019       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   14020       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   14021          elimination of previous stores to this field work better. */
   14022       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14023       break;
   14024 
   14025    case 0xD6: /* SALC */
   14026       t0 = newTemp(Ity_I32);
   14027       t1 = newTemp(Ity_I32);
   14028       assign( t0,  binop(Iop_And32,
   14029                          mk_x86g_calculate_eflags_c(),
   14030                          mkU32(1)) );
   14031       assign( t1, binop(Iop_Sar32,
   14032                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   14033                         mkU8(31)) );
   14034       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   14035       DIP("salc\n");
   14036       break;
   14037 
   14038    /* REPNE prefix insn.  In the switch below, each byte-sized opcode sets sz = 1 and deliberately falls through to the case after it. */
   14039    case 0xF2: {
   14040       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14041       if (sorb != 0) goto decode_failure;
   14042       abyte = getIByte(delta); delta++;
   14043 
   14044       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14045 
   14046       switch (abyte) {
   14047       /* According to the Intel manual, "repne movs" should never occur, but
   14048        * in practice it has happened, so allow for it here... */
   14049       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   14050       case 0xA5:
   14051          dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
   14052                              guest_EIP_bbstart+delta, "repne movs" );
   14053          break;
   14054 
   14055       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   14056       case 0xA7:
   14057          dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
   14058                              guest_EIP_bbstart+delta, "repne cmps" );
   14059          break;
   14060 
   14061       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   14062       case 0xAB:
   14063          dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
   14064                              guest_EIP_bbstart+delta, "repne stos" );
   14065          break;
   14066 
   14067       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   14068       case 0xAF:
   14069          dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
   14070                              guest_EIP_bbstart+delta, "repne scas" );
   14071          break;
   14072 
   14073       default:
   14074          goto decode_failure;
   14075       }
   14076       break;
   14077    }
   14078 
   14079    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   14080       for the rest, it means REP).  Byte-sized string opcodes set sz = 1 and deliberately fall through to the case after them. */
   14081    case 0xF3: {
   14082       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   14083       abyte = getIByte(delta); delta++;
   14084 
   14085       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   14086 
   14087       /* a segment override is only tolerated for the 0x0F (BSF/BSR) forms below */
   14088       if (sorb != 0 && abyte != 0x0F) goto decode_failure;
   14089 
   14090       switch (abyte) {
   14091       case 0x0F:
   14092          switch (getIByte(delta)) {
   14093          /* On older CPUs, TZCNT behaves the same as BSF.  */
   14094          case 0xBC: /* REP BSF Gv,Ev */
   14095             delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
   14096             break;
   14097          /* On older CPUs, LZCNT behaves the same as BSR.  */
   14098          case 0xBD: /* REP BSR Gv,Ev */
   14099             delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
   14100             break;
   14101          default:
   14102             goto decode_failure;
   14103          }
   14104          break;
   14105 
   14106       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   14107       case 0xA5:
   14108          dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
   14109                              guest_EIP_bbstart+delta, "rep movs" );
   14110          break;
   14111 
   14112       case 0xA6: sz = 1;   /* REPE CMPS<sz> */
   14113       case 0xA7:
   14114          dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
   14115                              guest_EIP_bbstart+delta, "repe cmps" );
   14116          break;
   14117 
   14118       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14119       case 0xAB:
   14120          dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
   14121                              guest_EIP_bbstart+delta, "rep stos" );
   14122          break;
   14123 
   14124       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14125       case 0xAD:
   14126          dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
   14127                              guest_EIP_bbstart+delta, "rep lods" );
   14128          break;
   14129 
   14130       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14131       case 0xAF:
   14132          dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
   14133                              guest_EIP_bbstart+delta, "repe scas" );
   14134          break;
   14135 
   14136       case 0x90:           /* REP NOP (PAUSE) */
   14137          /* a hint to the P4 re spin-wait loop */
   14138          DIP("rep nop (P4 pause)\n");
   14139          /* "observe" the hint.  The Vex client needs to be careful not
   14140             to cause very long delays as a result, though. */
   14141          jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14142          vassert(dres.whatNext == Dis_StopHere);
   14143          break;
   14144 
   14145       case 0xC3:           /* REP RET -- same as normal ret? */
   14146          dis_ret(&dres, 0);
   14147          DIP("rep ret\n");
   14148          break;
   14149 
   14150       default:
   14151          goto decode_failure;
   14152       }
   14153       break;
   14154    }
   14154 
   14155    /* ------------------------ XCHG ----------------------- */
   14156 
   14157    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14158       prefix; hence it must be translated with an IRCAS (at least, the
   14159       memory variant). */
   14160    case 0x86: /* XCHG Gb,Eb */
   14161       sz = 1;
   14162       /* Fall through ... */
   14163    case 0x87: /* XCHG Gv,Ev */
   14164       modrm = getIByte(delta);
   14165       ty = szToITy(sz);
   14166       t1 = newTemp(ty); t2 = newTemp(ty);
   14167       if (epartIsReg(modrm)) {
   14168          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14169          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14170          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14171          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14172          delta++;
   14173          DIP("xchg%c %s, %s\n",
   14174              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14175                             nameIReg(sz,eregOfRM(modrm)));
   14176       } else {
   14177          *expect_CAS = True;
   14178          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14179          assign( t1, loadLE(ty,mkexpr(addr)) );
   14180          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14181          casLE( mkexpr(addr),
   14182                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14183          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14184          delta += alen;
   14185          DIP("xchg%c %s, %s\n", nameISize(sz),
   14186                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14187       }
   14188       break;
   14189 
   14190    case 0x90: /* XCHG eAX,eAX */
   14191       DIP("nop\n");
   14192       break;
   14193    case 0x91: /* XCHG eAX,eCX */
   14194    case 0x92: /* XCHG eAX,eDX */
   14195    case 0x93: /* XCHG eAX,eBX */
   14196    case 0x94: /* XCHG eAX,eSP */
   14197    case 0x95: /* XCHG eAX,eBP */
   14198    case 0x96: /* XCHG eAX,eSI */
   14199    case 0x97: /* XCHG eAX,eDI */
   14200       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14201       break;
   14202 
   14203    /* ------------------------ XLAT ----------------------- */
   14204 
   14205    case 0xD7: /* XLAT */
   14206       if (sz != 4) goto decode_failure; /* NOTE: sz == 2 (0x66 prefix) is reportedly legal too, but is not handled here */
   14207       putIReg(
   14208          1,
   14209          R_EAX/*AL*/,
   14210          loadLE(Ity_I8,
   14211                 handleSegOverride(
   14212                    sorb,
   14213                    binop(Iop_Add32,
   14214                          getIReg(4, R_EBX),
   14215                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14216 
   14217       DIP("xlat%c [ebx]\n", nameISize(sz));
   14218       break;
   14219 
   14220    /* ------------------------ IN / OUT ----------------------- */
   14221 
   14222    case 0xE4: /* IN imm8, AL */
   14223       sz = 1;
   14224       t1 = newTemp(Ity_I32);
   14225       abyte = getIByte(delta); delta++;
   14226       assign(t1, mkU32( abyte & 0xFF ));
   14227       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14228       goto do_IN;
   14229    case 0xE5: /* IN imm8, eAX */
   14230       vassert(sz == 2 || sz == 4);
   14231       t1 = newTemp(Ity_I32);
   14232       abyte = getIByte(delta); delta++;
   14233       assign(t1, mkU32( abyte & 0xFF ));
   14234       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14235       goto do_IN;
   14236    case 0xEC: /* IN %DX, AL */
   14237       sz = 1;
   14238       t1 = newTemp(Ity_I32);
   14239       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14240       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14241                                          nameIReg(sz,R_EAX));
   14242       goto do_IN;
   14243    case 0xED: /* IN %DX, eAX */
   14244       vassert(sz == 2 || sz == 4);
   14245       t1 = newTemp(Ity_I32);
   14246       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14247       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14248                                          nameIReg(sz,R_EAX));
   14249       goto do_IN;
   14250    do_IN: {
   14251       /* At this point, sz indicates the width, and t1 is a 32-bit
   14252          value giving port number. */
   14253       IRDirty* d;
   14254       vassert(sz == 1 || sz == 2 || sz == 4);
   14255       ty = szToITy(sz);
   14256       t2 = newTemp(Ity_I32);
   14257       d = unsafeIRDirty_1_N(
   14258              t2,
   14259              0/*regparms*/,
   14260              "x86g_dirtyhelper_IN",
   14261              &x86g_dirtyhelper_IN,
   14262              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14263           );
   14264       /* do the call, dumping the result in t2. */
   14265       stmt( IRStmt_Dirty(d) );
   14266       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14267       break;
   14268    }
   14269 
   14270    case 0xE6: /* OUT AL, imm8 */
   14271       sz = 1;
   14272       t1 = newTemp(Ity_I32);
   14273       abyte = getIByte(delta); delta++;
   14274       assign( t1, mkU32( abyte & 0xFF ) );
   14275       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14276       goto do_OUT;
   14277    case 0xE7: /* OUT eAX, imm8 */
   14278       vassert(sz == 2 || sz == 4);
   14279       t1 = newTemp(Ity_I32);
   14280       abyte = getIByte(delta); delta++;
   14281       assign( t1, mkU32( abyte & 0xFF ) );
   14282       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14283       goto do_OUT;
   14284    case 0xEE: /* OUT AL, %DX */
   14285       sz = 1;
   14286       t1 = newTemp(Ity_I32);
   14287       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14288       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14289                                           nameIReg(2,R_EDX));
   14290       goto do_OUT;
   14291    case 0xEF: /* OUT eAX, %DX */
   14292       vassert(sz == 2 || sz == 4);
   14293       t1 = newTemp(Ity_I32);
   14294       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14295       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14296                                           nameIReg(2,R_EDX));
   14297       goto do_OUT;
   14298    do_OUT: {
   14299       /* At this point, sz indicates the width, and t1 is a 32-bit
   14300          value giving port number. */
   14301       IRDirty* d;
   14302       vassert(sz == 1 || sz == 2 || sz == 4);
   14303       ty = szToITy(sz);
   14304       d = unsafeIRDirty_0_N(
   14305              0/*regparms*/,
   14306              "x86g_dirtyhelper_OUT",
   14307              &x86g_dirtyhelper_OUT,
   14308              mkIRExprVec_3( mkexpr(t1),
   14309                             widenUto32( getIReg(sz, R_EAX) ),
   14310                             mkU32(sz) )
   14311           );
   14312       stmt( IRStmt_Dirty(d) );
   14313       break;
   14314    }
   14315 
   14316    /* ------------------------ (Grp1 extensions) ---------- */
   14317 
   14318    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14319                  case 0x80, but only in 32-bit mode. */
   14320       /* fallthru */
   14321    case 0x80: /* Grp1 Ib,Eb */
   14322       modrm = getIByte(delta);
   14323       am_sz = lengthAMode(delta);
   14324       sz    = 1;
   14325       d_sz  = 1;
   14326       d32   = getUChar(delta + am_sz);
   14327       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14328       break;
   14329 
   14330    case 0x81: /* Grp1 Iv,Ev */
   14331       modrm = getIByte(delta);
   14332       am_sz = lengthAMode(delta);
   14333       d_sz  = sz;
   14334       d32   = getUDisp(d_sz, delta + am_sz);
   14335       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14336       break;
   14337 
   14338    case 0x83: /* Grp1 Ib,Ev */
   14339       modrm = getIByte(delta);
   14340       am_sz = lengthAMode(delta);
   14341       d_sz  = 1;
   14342       d32   = getSDisp8(delta + am_sz);
   14343       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14344       break;
   14345 
   14346    /* ------------------------ (Grp2 extensions) ---------- */
   14347 
   14348    case 0xC0: { /* Grp2 Ib,Eb */
   14349       Bool decode_OK = True;
   14350       modrm = getIByte(delta);
   14351       am_sz = lengthAMode(delta);
   14352       d_sz  = 1;
   14353       d32   = getUChar(delta + am_sz);
   14354       sz    = 1;
   14355       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14356                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14357       if (!decode_OK)
   14358          goto decode_failure;
   14359       break;
   14360    }
   14361    case 0xC1: { /* Grp2 Ib,Ev */
   14362       Bool decode_OK = True;
   14363       modrm = getIByte(delta);
   14364       am_sz = lengthAMode(delta);
   14365       d_sz  = 1;
   14366       d32   = getUChar(delta + am_sz);
   14367       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14368                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14369       if (!decode_OK)
   14370          goto decode_failure;
   14371       break;
   14372    }
   14373    case 0xD0: { /* Grp2 1,Eb */
   14374       Bool decode_OK = True;
   14375       modrm = getIByte(delta);
   14376       am_sz = lengthAMode(delta);
   14377       d_sz  = 0;
   14378       d32   = 1;
   14379       sz    = 1;
   14380       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14381                          mkU8(d32), NULL, &decode_OK );
   14382       if (!decode_OK)
   14383          goto decode_failure;
   14384       break;
   14385    }
   14386    case 0xD1: { /* Grp2 1,Ev */
   14387       Bool decode_OK = True;
   14388       modrm = getUChar(delta);
   14389       am_sz = lengthAMode(delta);
   14390       d_sz  = 0;
   14391       d32   = 1;
   14392       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14393                          mkU8(d32), NULL, &decode_OK );
   14394       if (!decode_OK)
   14395          goto decode_failure;
   14396       break;
   14397    }
   14398    case 0xD2: { /* Grp2 CL,Eb */
   14399       Bool decode_OK = True;
   14400       modrm = getUChar(delta);
   14401       am_sz = lengthAMode(delta);
   14402       d_sz  = 0;
   14403       sz    = 1;
   14404       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14405                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14406       if (!decode_OK)
   14407          goto decode_failure;
   14408       break;
   14409    }
   14410    case 0xD3: { /* Grp2 CL,Ev */
   14411       Bool decode_OK = True;
   14412       modrm = getIByte(delta);
   14413       am_sz = lengthAMode(delta);
   14414       d_sz  = 0;
   14415       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14416                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14417       if (!decode_OK)
   14418          goto decode_failure;
   14419       break;
   14420    }
   14421 
   14422    /* ------------------------ (Grp3 extensions) ---------- */
   14423 
   14424    case 0xF6: { /* Grp3 Eb */
   14425       Bool decode_OK = True;
   14426       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14427       if (!decode_OK)
   14428          goto decode_failure;
   14429       break;
   14430    }
   14431    case 0xF7: { /* Grp3 Ev */
   14432       Bool decode_OK = True;
   14433       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14434       if (!decode_OK)
   14435          goto decode_failure;
   14436       break;
   14437    }
   14438 
   14439    /* ------------------------ (Grp4 extensions) ---------- */
   14440 
   14441    case 0xFE: { /* Grp4 Eb */
   14442       Bool decode_OK = True;
   14443       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14444       if (!decode_OK)
   14445          goto decode_failure;
   14446       break;
   14447    }
   14448 
   14449    /* ------------------------ (Grp5 extensions) ---------- */
   14450 
   14451    case 0xFF: { /* Grp5 Ev */
   14452       Bool decode_OK = True;
   14453       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14454       if (!decode_OK)
   14455          goto decode_failure;
   14456       break;
   14457    }
   14458 
   14459    /* ------------------------ Escapes to 2-byte opcodes -- */
   14460 
   14461    case 0x0F: {
   14462       opc = getIByte(delta); delta++;
   14463       switch (opc) {
   14464 
   14465       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14466 
   14467       case 0xBA: { /* Grp8 Ib,Ev */
   14468          Bool decode_OK = False;
   14469          modrm = getUChar(delta);
   14470          am_sz = lengthAMode(delta);
   14471          d32   = getSDisp8(delta + am_sz);
   14472          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14473                                 am_sz, sz, d32, &decode_OK );
   14474          if (!decode_OK)
   14475             goto decode_failure;
   14476          break;
   14477       }
   14478 
   14479       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14480 
   14481       case 0xBC: /* BSF Gv,Ev */
   14482          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14483          break;
   14484       case 0xBD: /* BSR Gv,Ev */
   14485          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14486          break;
   14487 
   14488       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14489 
   14490       case 0xC8: /* BSWAP %eax */
   14491       case 0xC9:
   14492       case 0xCA:
   14493       case 0xCB:
   14494       case 0xCC:
   14495       case 0xCD:
   14496       case 0xCE:
   14497       case 0xCF: /* BSWAP %edi */
   14498          /* AFAICS from the Intel docs, this only exists at size 4. */
   14499          if (sz != 4) goto decode_failure;
   14500 
   14501          t1 = newTemp(Ity_I32);
   14502          assign( t1, getIReg(4, opc-0xC8) );
   14503          t2 = math_BSWAP(t1, Ity_I32);
   14504 
   14505          putIReg(4, opc-0xC8, mkexpr(t2));
   14506          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14507          break;
   14508 
   14509       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14510 
   14511       case 0xA3: /* BT Gv,Ev */
   14512          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14513          break;
   14514       case 0xB3: /* BTR Gv,Ev */
   14515          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14516          break;
   14517       case 0xAB: /* BTS Gv,Ev */
   14518          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14519          break;
   14520       case 0xBB: /* BTC Gv,Ev */
   14521          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14522          break;
   14523 
   14524       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14525 
   14526       case 0x40:
   14527       case 0x41:
   14528       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14529       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14530       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14531       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14532       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14533       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14534       case 0x48: /* CMOVSb (cmov negative) */
   14535       case 0x49: /* CMOVNSb (cmov not negative) */
   14536       case 0x4A: /* CMOVP (cmov parity even) */
   14537       case 0x4B: /* CMOVNP (cmov parity odd) */
   14538       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14539       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14540       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14541       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14542          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14543          break;
   14544 
   14545       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14546 
   14547       case 0xB0: /* CMPXCHG Gb,Eb */
   14548          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14549          break;
   14550       case 0xB1: /* CMPXCHG Gv,Ev */
   14551          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14552          break;
   14553 
   14554       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14555          IRTemp expdHi    = newTemp(Ity_I32);
   14556          IRTemp expdLo    = newTemp(Ity_I32);
   14557          IRTemp dataHi    = newTemp(Ity_I32);
   14558          IRTemp dataLo    = newTemp(Ity_I32);
   14559          IRTemp oldHi     = newTemp(Ity_I32);
   14560          IRTemp oldLo     = newTemp(Ity_I32);
   14561          IRTemp flags_old = newTemp(Ity_I32);
   14562          IRTemp flags_new = newTemp(Ity_I32);
   14563          IRTemp success   = newTemp(Ity_I1);
   14564 
   14565          /* Translate this using a DCAS, even if there is no LOCK
   14566             prefix.  Life is too short to bother with generating two
   14567             different translations for the with/without-LOCK-prefix
   14568             cases. */
   14569          *expect_CAS = True;
   14570 
   14571 	 /* Decode, and generate address. */
   14572          if (sz != 4) goto decode_failure;
   14573          modrm = getIByte(delta);
   14574          if (epartIsReg(modrm)) goto decode_failure;
   14575          if (gregOfRM(modrm) != 1) goto decode_failure;
   14576          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14577          delta += alen;
   14578 
   14579          /* Get the expected and new values. */
   14580          assign( expdHi, getIReg(4,R_EDX) );
   14581          assign( expdLo, getIReg(4,R_EAX) );
   14582          assign( dataHi, getIReg(4,R_ECX) );
   14583          assign( dataLo, getIReg(4,R_EBX) );
   14584 
   14585          /* Do the DCAS */
   14586          stmt( IRStmt_CAS(
   14587                   mkIRCAS( oldHi, oldLo,
   14588                            Iend_LE, mkexpr(addr),
   14589                            mkexpr(expdHi), mkexpr(expdLo),
   14590                            mkexpr(dataHi), mkexpr(dataLo)
   14591                )));
   14592 
   14593          /* success when oldHi:oldLo == expdHi:expdLo */
   14594          assign( success,
   14595                  binop(Iop_CasCmpEQ32,
   14596                        binop(Iop_Or32,
   14597                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14598                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14599                        ),
   14600                        mkU32(0)
   14601                  ));
   14602 
   14603          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14604             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14605             which is where they came from originally.  Both the actual
   14606             contents of these two regs, and any shadow values, are
   14607             unchanged.  If the DCAS fails then we're putting into
   14608             EDX:EAX the value seen in memory. */
   14609          putIReg(4, R_EDX,
   14610                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14611                                   mkexpr(oldHi),
   14612                                   mkexpr(expdHi)
   14613                 ));
   14614          putIReg(4, R_EAX,
   14615                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14616                                   mkexpr(oldLo),
   14617                                   mkexpr(expdLo)
   14618                 ));
   14619 
   14620          /* Copy the success bit into the Z flag and leave the others
   14621             unchanged */
   14622          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14623          assign(
   14624             flags_new,
   14625             binop(Iop_Or32,
   14626                   binop(Iop_And32, mkexpr(flags_old),
   14627                                    mkU32(~X86G_CC_MASK_Z)),
   14628                   binop(Iop_Shl32,
   14629                         binop(Iop_And32,
   14630                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14631                         mkU8(X86G_CC_SHIFT_Z)) ));
   14632 
   14633          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14634          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14635          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14636          /* Set NDEP even though it isn't used.  This makes
   14637             redundant-PUT elimination of previous stores to this field
   14638             work better. */
   14639          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14640 
   14641          /* Sheesh.  Aren't you glad it was me and not you that had to
   14642 	    write and validate all this grunge? */
   14643 
   14644 	 DIP("cmpxchg8b %s\n", dis_buf);
   14645 	 break;
   14646       }
   14647 
   14648       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14649 
   14650       case 0xA2: { /* CPUID */
   14651          /* Uses dirty helper:
   14652                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
   14653             declared to mod eax and ecx, wr ebx and edx
   14654          */
   14655          IRDirty* d     = NULL;
   14656          HChar*   fName = NULL;
   14657          void*    fAddr = NULL;
   14658          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14659             fName = "x86g_dirtyhelper_CPUID_sse2";
   14660             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14661          }
   14662          else
   14663          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14664             fName = "x86g_dirtyhelper_CPUID_sse1";
   14665             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14666          }
   14667          else
   14668          if (archinfo->hwcaps == 0/*no SSE*/) {
   14669             fName = "x86g_dirtyhelper_CPUID_sse0";
   14670             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14671          } else
   14672             vpanic("disInstr(x86)(cpuid)");
   14673 
   14674          vassert(fName); vassert(fAddr);
   14675          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14676                                  fName, fAddr, mkIRExprVec_0() );
   14677          /* declare guest state effects */
   14678          d->needsBBP = True;
   14679          d->nFxState = 4;
   14680          vex_bzero(&d->fxState, sizeof(d->fxState));
   14681          d->fxState[0].fx     = Ifx_Modify;
   14682          d->fxState[0].offset = OFFB_EAX;
   14683          d->fxState[0].size   = 4;
   14684          d->fxState[1].fx     = Ifx_Write;
   14685          d->fxState[1].offset = OFFB_EBX;
   14686          d->fxState[1].size   = 4;
   14687          d->fxState[2].fx     = Ifx_Modify;
   14688          d->fxState[2].offset = OFFB_ECX;
   14689          d->fxState[2].size   = 4;
   14690          d->fxState[3].fx     = Ifx_Write;
   14691          d->fxState[3].offset = OFFB_EDX;
   14692          d->fxState[3].size   = 4;
   14693          /* execute the dirty call, side-effecting guest state */
   14694          stmt( IRStmt_Dirty(d) );
   14695          /* CPUID is a serialising insn.  So, just in case someone is
   14696             using it as a memory fence ... */
   14697          stmt( IRStmt_MBE(Imbe_Fence) );
   14698          DIP("cpuid\n");
   14699          break;
   14700       }
   14701 
   14702 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14703 //--             goto decode_failure;
   14704 //--
   14705 //--          t1 = newTemp(cb);
   14706 //--          t2 = newTemp(cb);
   14707 //--          t3 = newTemp(cb);
   14708 //--          t4 = newTemp(cb);
   14709 //--          uInstr0(cb, CALLM_S, 0);
   14710 //--
   14711 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14712 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14713 //--
   14714 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14715 //--          uLiteral(cb, 0);
   14716 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14717 //--
   14718 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14719 //--          uLiteral(cb, 0);
   14720 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14721 //--
   14722 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14723 //--          uLiteral(cb, 0);
   14724 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14725 //--
   14726 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14727 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14728 //--
   14729 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14730 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14731 //--
   14732 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14733 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14734 //--
   14735 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14736 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14737 //--
   14738 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14739 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14740 //--
   14741 //--          uInstr0(cb, CALLM_E, 0);
   14742 //--          DIP("cpuid\n");
   14743 //--          break;
   14744 //--
   14745       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14746 
   14747       case 0xB6: /* MOVZXb Eb,Gv */
   14748          if (sz != 2 && sz != 4)
   14749             goto decode_failure;
   14750          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   14751          break;
   14752 
   14753       case 0xB7: /* MOVZXw Ew,Gv */
   14754          if (sz != 4)
   14755             goto decode_failure;
   14756          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   14757          break;
   14758 
   14759       case 0xBE: /* MOVSXb Eb,Gv */
   14760          if (sz != 2 && sz != 4)
   14761             goto decode_failure;
   14762          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   14763          break;
   14764 
   14765       case 0xBF: /* MOVSXw Ew,Gv */
   14766          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   14767             goto decode_failure;
   14768          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   14769          break;
   14770 
   14771 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   14772 //--
   14773 //--       case 0xC3: /* MOVNTI Gv,Ev */
   14774 //--          vg_assert(sz == 4);
   14775 //--          modrm = getUChar(eip);
   14776 //--          vg_assert(!epartIsReg(modrm));
   14777 //--          t1 = newTemp(cb);
   14778 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   14779 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   14780 //--          t2 = LOW24(pair);
   14781 //--          eip += HI8(pair);
   14782 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   14783 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   14784 //--          break;
   14785 
   14786       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   14787 
   14788       case 0xAF: /* IMUL Ev, Gv */
   14789          delta = dis_mul_E_G ( sorb, sz, delta );
   14790          break;
   14791 
   14792       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   14793 
   14794       case 0x1F:
   14795          modrm = getUChar(delta);
   14796          if (epartIsReg(modrm)) goto decode_failure;
   14797          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14798          delta += alen;
   14799          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   14800          break;
   14801 
   14802       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   14803       case 0x80:
   14804       case 0x81:
   14805       case 0x82: /* JBb/JNAEb (jump below) */
   14806       case 0x83: /* JNBb/JAEb (jump not below) */
   14807       case 0x84: /* JZb/JEb (jump zero) */
   14808       case 0x85: /* JNZb/JNEb (jump not zero) */
   14809       case 0x86: /* JBEb/JNAb (jump below or equal) */
   14810       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   14811       case 0x88: /* JSb (jump negative) */
   14812       case 0x89: /* JNSb (jump not negative) */
   14813       case 0x8A: /* JP (jump parity even) */
   14814       case 0x8B: /* JNP/JPO (jump parity odd) */
   14815       case 0x8C: /* JLb/JNGEb (jump less) */
   14816       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   14817       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   14818       case 0x8F: /* JGb/JNLEb (jump greater) */
   14819        { Int    jmpDelta;
   14820          HChar* comment  = "";
   14821          jmpDelta = (Int)getUDisp32(delta);
   14822          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   14823          delta += 4;
   14824          if (resteerCisOk
   14825              && vex_control.guest_chase_cond
   14826              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14827              && jmpDelta < 0
   14828              && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   14829             /* Speculation: assume this backward branch is taken.  So
   14830                we need to emit a side-exit to the insn following this
   14831                one, on the negation of the condition, and continue at
   14832                the branch target address (d32).  If we wind up back at
   14833                the first instruction of the trace, just stop; it's
   14834                better to let the IR loop unroller handle that case.*/
   14835             stmt( IRStmt_Exit(
   14836                      mk_x86g_calculate_condition((X86Condcode)
   14837                                                  (1 ^ (opc - 0x80))),
   14838                      Ijk_Boring,
   14839                      IRConst_U32(guest_EIP_bbstart+delta),
   14840                      OFFB_EIP ) );
   14841             dres.whatNext   = Dis_ResteerC;
   14842             dres.continueAt = (Addr64)(Addr32)d32;
   14843             comment = "(assumed taken)";
   14844          }
   14845          else
   14846          if (resteerCisOk
   14847              && vex_control.guest_chase_cond
   14848              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14849              && jmpDelta >= 0
   14850              && resteerOkFn( callback_opaque,
   14851                              (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   14852             /* Speculation: assume this forward branch is not taken.
   14853                So we need to emit a side-exit to d32 (the dest) and
   14854                continue disassembling at the insn immediately
   14855                following this one. */
   14856             stmt( IRStmt_Exit(
   14857                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   14858                      Ijk_Boring,
   14859                      IRConst_U32(d32),
   14860                      OFFB_EIP ) );
   14861             dres.whatNext   = Dis_ResteerC;
   14862             dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   14863             comment = "(assumed not taken)";
   14864          }
   14865          else {
   14866             /* Conservative default translation - end the block at
   14867                this point. */
   14868             jcc_01( &dres, (X86Condcode)(opc - 0x80),
   14869                     (Addr32)(guest_EIP_bbstart+delta), d32);
   14870             vassert(dres.whatNext == Dis_StopHere);
   14871          }
   14872          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   14873          break;
   14874        }
   14875 
   14876       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   14877       case 0x31: { /* RDTSC */
   14878          IRTemp   val  = newTemp(Ity_I64);
   14879          IRExpr** args = mkIRExprVec_0();
   14880          IRDirty* d    = unsafeIRDirty_1_N (
   14881                             val,
   14882                             0/*regparms*/,
   14883                             "x86g_dirtyhelper_RDTSC",
   14884                             &x86g_dirtyhelper_RDTSC,
   14885                             args
   14886                          );
   14887          /* execute the dirty call, dumping the result in val. */
   14888          stmt( IRStmt_Dirty(d) );
   14889          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   14890          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   14891          DIP("rdtsc\n");
   14892          break;
   14893       }
   14894 
   14895       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   14896 
   14897       case 0xA1: /* POP %FS */
   14898          dis_pop_segreg( R_FS, sz ); break;
   14899       case 0xA9: /* POP %GS */
   14900          dis_pop_segreg( R_GS, sz ); break;
   14901 
   14902       case 0xA0: /* PUSH %FS */
   14903          dis_push_segreg( R_FS, sz ); break;
   14904       case 0xA8: /* PUSH %GS */
   14905          dis_push_segreg( R_GS, sz ); break;
   14906 
   14907       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   14908       case 0x90:
   14909       case 0x91:
   14910       case 0x92: /* set-Bb/set-NAEb (jump below) */
   14911       case 0x93: /* set-NBb/set-AEb (jump not below) */
   14912       case 0x94: /* set-Zb/set-Eb (jump zero) */
   14913       case 0x95: /* set-NZb/set-NEb (jump not zero) */
   14914       case 0x96: /* set-BEb/set-NAb (jump below or equal) */
   14915       case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
   14916       case 0x98: /* set-Sb (jump negative) */
   14917       case 0x99: /* set-NSb (jump not negative) */
   14918       case 0x9A: /* set-P (jump parity even) */
   14919       case 0x9B: /* set-NP (jump parity odd) */
   14920       case 0x9C: /* set-Lb/set-NGEb (jump less) */
   14921       case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
   14922       case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
   14923       case 0x9F: /* set-Gb/set-NLEb (jump greater) */
   14924          t1 = newTemp(Ity_I8);
   14925          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   14926          modrm = getIByte(delta);
   14927          if (epartIsReg(modrm)) {
   14928             delta++;
   14929             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   14930             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   14931                               nameIReg(1,eregOfRM(modrm)));
   14932          } else {
   14933            addr = disAMode ( &alen, sorb, delta, dis_buf );
   14934            delta += alen;
   14935            storeLE( mkexpr(addr), mkexpr(t1) );
   14936            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   14937          }
   14938          break;
   14939 
   14940       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   14941 
   14942       case 0xA4: /* SHLDv imm8,Gv,Ev */
   14943          modrm = getIByte(delta);
   14944          d32   = delta + lengthAMode(delta);
   14945          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14946          delta = dis_SHLRD_Gv_Ev (
   14947                   sorb, delta, modrm, sz,
   14948                   mkU8(getIByte(d32)), True, /* literal */
   14949                   dis_buf, True );
   14950          break;
   14951       case 0xA5: /* SHLDv %cl,Gv,Ev */
   14952          modrm = getIByte(delta);
   14953          delta = dis_SHLRD_Gv_Ev (
   14954                     sorb, delta, modrm, sz,
   14955                     getIReg(1,R_ECX), False, /* not literal */
   14956                     "%cl", True );
   14957          break;
   14958 
   14959       case 0xAC: /* SHRDv imm8,Gv,Ev */
   14960          modrm = getIByte(delta);
   14961          d32   = delta + lengthAMode(delta);
   14962          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14963          delta = dis_SHLRD_Gv_Ev (
   14964                     sorb, delta, modrm, sz,
   14965                     mkU8(getIByte(d32)), True, /* literal */
   14966                     dis_buf, False );
   14967          break;
   14968       case 0xAD: /* SHRDv %cl,Gv,Ev */
   14969          modrm = getIByte(delta);
   14970          delta = dis_SHLRD_Gv_Ev (
   14971                     sorb, delta, modrm, sz,
   14972                     getIReg(1,R_ECX), False, /* not literal */
   14973                     "%cl", False );
   14974          break;
   14975 
   14976       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   14977 
   14978       case 0x34:
   14979          /* Simple implementation needing a long explanation.
   14980 
   14981             sysenter is a kind of syscall entry.  The key thing here
   14982             is that the return address is not known -- that is
   14983             something that is beyond Vex's knowledge.  So this IR
   14984             forces a return to the scheduler, which can do what it
   14985             likes to simulate the sysenter, but it MUST set this
   14986             thread's guest_EIP field with the continuation address
   14987             before resuming execution.  If that doesn't happen, the
   14988             thread will jump to address zero, which is probably
   14989             fatal.
   14990          */
   14991 
   14992          /* Note where we are, so we can back up the guest to this
   14993             point if the syscall needs to be restarted. */
   14994          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   14995                            mkU32(guest_EIP_curr_instr) ) );
   14996          jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   14997          vassert(dres.whatNext == Dis_StopHere);
   14998          DIP("sysenter");
   14999          break;
   15000 
   15001       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   15002 
   15003       case 0xC0: { /* XADD Gb,Eb */
   15004          Bool decodeOK;
   15005          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   15006          if (!decodeOK) goto decode_failure;
   15007          break;
   15008       }
   15009       case 0xC1: { /* XADD Gv,Ev */
   15010          Bool decodeOK;
   15011          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   15012          if (!decodeOK) goto decode_failure;
   15013          break;
   15014       }
   15015 
   15016       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   15017 
   15018       case 0x71:
   15019       case 0x72:
   15020       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   15021 
   15022       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   15023       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   15024       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   15025       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   15026 
   15027       case 0xFC:
   15028       case 0xFD:
   15029       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   15030 
   15031       case 0xEC:
   15032       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15033 
   15034       case 0xDC:
   15035       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15036 
   15037       case 0xF8:
   15038       case 0xF9:
   15039       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   15040 
   15041       case 0xE8:
   15042       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15043 
   15044       case 0xD8:
   15045       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   15046 
   15047       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   15048       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   15049 
   15050       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   15051 
   15052       case 0x74:
   15053       case 0x75:
   15054       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   15055 
   15056       case 0x64:
   15057       case 0x65:
   15058       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   15059 
   15060       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   15061       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15062       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   15063 
   15064       case 0x68:
   15065       case 0x69:
   15066       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   15067 
   15068       case 0x60:
   15069       case 0x61:
   15070       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15071 
   15072       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   15073       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   15074       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   15075       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   15076 
   15077       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15078       case 0xF2:
   15079       case 0xF3:
   15080 
   15081       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   15082       case 0xD2:
   15083       case 0xD3:
   15084 
   15085       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   15086       case 0xE2:
   15087       {
   15088          Int  delta0    = delta-1;
   15089          Bool decode_OK = False;
   15090 
   15091          /* If sz==2 this is SSE, and we assume sse idec has
   15092             already spotted those cases by now. */
   15093          if (sz != 4)
   15094             goto decode_failure;
   15095 
   15096          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   15097          if (!decode_OK) {
   15098             delta = delta0;
   15099             goto decode_failure;
   15100          }
   15101          break;
   15102       }
   15103 
   15104       case 0x0E: /* FEMMS */
   15105       case 0x77: /* EMMS */
   15106          if (sz != 4)
   15107             goto decode_failure;
   15108          do_EMMS_preamble();
   15109          DIP("{f}emms\n");
   15110          break;
   15111 
   15112       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15113       case 0x01: /* 0F 01 /0 -- SGDT */
   15114                  /* 0F 01 /1 -- SIDT */
   15115       {
   15116           /* This is really revolting, but ... since each processor
   15117              (core) only has one IDT and one GDT, just let the guest
   15118              see it (pass-through semantics).  I can't see any way to
   15119              construct a faked-up value, so don't bother to try. */
   15120          modrm = getUChar(delta);
   15121          addr = disAMode ( &alen, sorb, delta, dis_buf );
   15122          delta += alen;
   15123          if (epartIsReg(modrm)) goto decode_failure;
   15124          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15125             goto decode_failure;
   15126          switch (gregOfRM(modrm)) {
   15127             case 0: DIP("sgdt %s\n", dis_buf); break;
   15128             case 1: DIP("sidt %s\n", dis_buf); break;
   15129             default: vassert(0); /*NOTREACHED*/
   15130          }
   15131 
   15132          IRDirty* d = unsafeIRDirty_0_N (
   15133                           0/*regparms*/,
   15134                           "x86g_dirtyhelper_SxDT",
   15135                           &x86g_dirtyhelper_SxDT,
   15136                           mkIRExprVec_2( mkexpr(addr),
   15137                                          mkU32(gregOfRM(modrm)) )
   15138                       );
   15139          /* declare we're writing memory */
   15140          d->mFx   = Ifx_Write;
   15141          d->mAddr = mkexpr(addr);
   15142          d->mSize = 6;
   15143          stmt( IRStmt_Dirty(d) );
   15144          break;
   15145       }
   15146 
   15147       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15148 
   15149       default:
   15150          goto decode_failure;
   15151    } /* switch (opc) for the 2-byte opcodes */
   15152    goto decode_success;
   15153    } /* case 0x0F: of primary opcode */
   15154 
   15155    /* ------------------------ ??? ------------------------ */
   15156 
   15157   default:
   15158   decode_failure:
   15159    /* All decode failures end up here. */
   15160    vex_printf("vex x86->IR: unhandled instruction bytes: "
   15161               "0x%x 0x%x 0x%x 0x%x\n",
   15162               (Int)getIByte(delta_start+0),
   15163               (Int)getIByte(delta_start+1),
   15164               (Int)getIByte(delta_start+2),
   15165               (Int)getIByte(delta_start+3) );
   15166 
   15167    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15168       not been executed, and (is currently) the next to be executed.
   15169       EIP should be up-to-date since it is made so at the start of each
   15170       insn, but nevertheless be paranoid and update it again right
   15171       now. */
   15172    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15173    jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
   15174    vassert(dres.whatNext == Dis_StopHere);
   15175    dres.len = 0;
   15176    /* We also need to say that a CAS is not expected now, regardless
   15177       of what it might have been set to at the start of the function,
   15178       since the IR that we've emitted just above (to synthesise a
   15179       SIGILL) does not involve any CAS, and presumably no other IR has
   15180       been emitted for this (non-decoded) insn. */
   15181    *expect_CAS = False;
   15182    return dres;
   15183 
   15184    } /* switch (opc) for the main (primary) opcode switch. */
   15185 
   15186   decode_success:
   15187    /* All decode successes end up here. */
   15188    switch (dres.whatNext) {
   15189       case Dis_Continue:
   15190          stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
   15191          break;
   15192       case Dis_ResteerU:
   15193       case Dis_ResteerC:
   15194          stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
   15195          break;
   15196       case Dis_StopHere:
   15197          break;
   15198       default:
   15199          vassert(0);
   15200    }
   15201 
   15202    DIP("\n");
   15203    dres.len = delta - delta_start;
   15204    return dres;
   15205 }
   15206 
   15207 #undef DIP
   15208 #undef DIS
   15209 
   15210 
   15211 /*------------------------------------------------------------*/
   15212 /*--- Top-level fn                                         ---*/
   15213 /*------------------------------------------------------------*/
   15214 
   15215 /* Disassemble a single instruction into IR.  The instruction
   15216    is located in host memory at &guest_code[delta]. */
   15217 
   15218 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15219                          Bool         (*resteerOkFn) ( void*, Addr64 ),
   15220                          Bool         resteerCisOk,
   15221                          void*        callback_opaque,
   15222                          UChar*       guest_code_IN,
   15223                          Long         delta,
   15224                          Addr64       guest_IP,
   15225                          VexArch      guest_arch,
   15226                          VexArchInfo* archinfo,
   15227                          VexAbiInfo*  abiinfo,
   15228                          Bool         host_bigendian_IN )
   15229 {
   15230    Int       i, x1, x2;
   15231    Bool      expect_CAS, has_CAS;
   15232    DisResult dres;
   15233 
   15234    /* Set globals (see top of this file) */
   15235    vassert(guest_arch == VexArchX86);
   15236    guest_code           = guest_code_IN;
   15237    irsb                 = irsb_IN;
   15238    host_is_bigendian    = host_bigendian_IN;
   15239    guest_EIP_curr_instr = (Addr32)guest_IP;
   15240    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15241 
   15242    x1 = irsb_IN->stmts_used;
   15243    expect_CAS = False;
   15244    dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15245                              resteerCisOk,
   15246                              callback_opaque,
   15247                              delta, archinfo, abiinfo );
   15248    x2 = irsb_IN->stmts_used;
   15249    vassert(x2 >= x1);
   15250 
   15251    /* See comment at the top of disInstr_X86_WRK for meaning of
   15252       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15253       IRCAS as directed by the returned expect_CAS value. */
   15254    has_CAS = False;
   15255    for (i = x1; i < x2; i++) {
   15256       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15257          has_CAS = True;
   15258    }
   15259 
   15260    if (expect_CAS != has_CAS) {
   15261       /* inconsistency detected.  re-disassemble the instruction so as
   15262          to generate a useful error message; then assert. */
   15263       vex_traceflags |= VEX_TRACE_FE;
   15264       dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
   15265                                 resteerCisOk,
   15266                                 callback_opaque,
   15267                                 delta, archinfo, abiinfo );
   15268       for (i = x1; i < x2; i++) {
   15269          vex_printf("\t\t");
   15270          ppIRStmt(irsb_IN->stmts[i]);
   15271          vex_printf("\n");
   15272       }
   15273       /* Failure of this assertion is serious and denotes a bug in
   15274          disInstr. */
   15275       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15276    }
   15277 
   15278    return dres;
   15279 }
   15280 
   15281 
   15282 /*--------------------------------------------------------------------*/
   15283 /*--- end                                         guest_x86_toIR.c ---*/
   15284 /*--------------------------------------------------------------------*/
   15285