Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                       guest_x86_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2010 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates x86 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 32-bit value is being written.
     42 
     43    FUCOMI(P): what happens to A and S flags?  Currently are forced
     44       to zero.
     45 
     46    x87 FP Limitations:
     47 
     48    * all arithmetic done at 64 bits
     49 
     50    * no FP exceptions, except for handling stack over/underflow
     51 
     52    * FP rounding mode observed only for float->int conversions
     53      and int->float conversions which could lose accuracy, and
     54      for float-to-float rounding.  For all other operations,
     55      round-to-nearest is used, regardless.
     56 
     57    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     58      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     59      even when it isn't.
     60 
     61    * some of the FCOM cases could do with testing -- not convinced
     62      that the args are the right way round.
     63 
     64    * FSAVE does not re-initialise the FPU; it should do
     65 
     66    * FINIT not only initialises the FPU environment, it also
     67      zeroes all the FP registers.  It should leave the registers
     68      unchanged.
     69 
     70    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     71    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     72    only way to observe eflags[1], a proper fix would be to make that
     73    bit be set by PUSHF.
     74 
     75    The state of %eflags.AC (alignment check, bit 18) is recorded by
     76    the simulation (viz, if you set it with popf then a pushf produces
     77    the value you set it to), but it is otherwise ignored.  In
     78    particular, setting it to 1 does NOT cause alignment checking to
     79    happen.  Programs that set it to 1 and then rely on the resulting
     80    SIGBUSs to inform them of misaligned accesses will not work.
     81 
     82    Implementation of sysenter is necessarily partial.  sysenter is a
     83    kind of system call entry.  When doing a sysenter, the return
     84    address is not known -- that is something that is beyond Vex's
     85    knowledge.  So the generated IR forces a return to the scheduler,
     86    which can do what it likes to simulate the sysenter, but it MUST
     87    set this thread's guest_EIP field with the continuation address
     88    before resuming execution.  If that doesn't happen, the thread will
     89    jump to address zero, which is probably fatal.
     90 
     91    This module uses global variables and so is not MT-safe (if that
     92    should ever become relevant).
     93 
     94    The delta values are 32-bit ints, not 64-bit ints.  That means
     95    this module may not work right if run on a 64-bit host.  That should
     96    be fixed properly, really -- if anyone ever wants to use Vex to
     97    translate x86 code for execution on a 64-bit host.
     98 
     99    casLE (implementation of lock-prefixed insns) and rep-prefixed
    100    insns: the side-exit back to the start of the insn is done with
    101    Ijk_Boring.  This is quite wrong, it should be done with
    102    Ijk_NoRedir, since otherwise the side exit, which is intended to
    103    restart the instruction for whatever reason, could go somewhere
    104    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    105    no-redir jumps performance critical, at least for rep-prefixed
    106    instructions, since all iterations thereof would involve such a
    107    jump.  It's not such a big deal with casLE since the side exit is
    108    only taken if the CAS fails, that is, the location is contended,
    109    which is relatively unlikely.
    110 
    111    XXXX: Nov 2009: handling of SWP on ARM suffers from the same
    112    problem.
    113 
    114    Note also, the test for CAS success vs failure is done using
    115    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    116    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    117    shouldn't definedness-check these comparisons.  See
    118    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    119    background/rationale.
    120 */
    121 
    122 /* Performance holes:
    123 
    124    - fcom ; fstsw %ax ; sahf
    125      sahf does not update the O flag (sigh) and so O needs to
    126      be computed.  This is done expensively; it would be better
    127      to have a calculate_eflags_o helper.
    128 
    129    - emwarns; some FP codes can generate huge numbers of these
    130      if the fpucw is changed in an inner loop.  It would be
    131      better for the guest state to have an emwarn-enable reg
    132      which can be set zero or nonzero.  If it is zero, emwarns
    133      are not flagged, and instead control just flows all the
    134      way through bbs as usual.
    135 */
    136 
    137 /* "Special" instructions.
    138 
    139    This instruction decoder can decode three special instructions
    140    which mean nothing natively (are no-ops as far as regs/mem are
    141    concerned) but have meaning for supporting Valgrind.  A special
    142    instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
    143    C1C713 (in the standard interpretation, that means: roll $3, %edi;
    144    roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
    145    one of the following 3 are allowed (standard interpretation in
    146    parentheses):
    147 
    148       87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
    149       87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
    150       87D2 (xchgl %edx,%edx)   call-noredir *%EAX
    151 
    152    Any other bytes following the 12-byte preamble are illegal and
    153    constitute a failure in instruction decoding.  This all assumes
    154    that the preamble will never occur except in specific code
    155    fragments designed for Valgrind to catch.
    156 
    157    No prefixes may precede a "Special" instruction.
    158 */
    159 
    160 /* LOCK prefixed instructions.  These are translated using IR-level
    161    CAS statements (IRCAS) and are believed to preserve atomicity, even
    162    from the point of view of some other process racing against a
    163    simulated one (presumably they communicate via a shared memory
    164    segment).
    165 
    166    Handlers which are aware of LOCK prefixes are:
    167       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    168       dis_cmpxchg_G_E  (cmpxchg)
    169       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    170       dis_Grp3         (not, neg)
    171       dis_Grp4         (inc, dec)
    172       dis_Grp5         (inc, dec)
    173       dis_Grp8_Imm     (bts, btc, btr)
    174       dis_bt_G_E       (bts, btc, btr)
    175       dis_xadd_G_E     (xadd)
    176 */
    177 
    178 
    179 #include "libvex_basictypes.h"
    180 #include "libvex_ir.h"
    181 #include "libvex.h"
    182 #include "libvex_guest_x86.h"
    183 
    184 #include "main_util.h"
    185 #include "main_globals.h"
    186 #include "guest_generic_bb_to_IR.h"
    187 #include "guest_generic_x87.h"
    188 #include "guest_x86_defs.h"
    189 
    190 
    191 /*------------------------------------------------------------*/
    192 /*--- Globals                                              ---*/
    193 /*------------------------------------------------------------*/
    194 
    195 /* These are set at the start of the translation of an insn, right
    196    down in disInstr_X86, so that we don't have to pass them around
    197    endlessly.  They are all constant during the translation of any
    198    given insn. */
    199 
    200 /* We need to know this to do sub-register accesses correctly. */
    201 static Bool host_is_bigendian;
    202 
    203 /* Pointer to the guest code area (points to start of BB, not to the
    204    insn being processed). */
    205 static UChar* guest_code;
    206 
    207 /* The guest address corresponding to guest_code[0]. */
    208 static Addr32 guest_EIP_bbstart;
    209 
    210 /* The guest address for the instruction currently being
    211    translated. */
    212 static Addr32 guest_EIP_curr_instr;
    213 
    214 /* The IRSB* into which we're generating code. */
    215 static IRSB* irsb;
    216 
    217 
    218 /*------------------------------------------------------------*/
    219 /*--- Debugging output                                     ---*/
    220 /*------------------------------------------------------------*/
    221 
    222 #define DIP(format, args...)           \
    223    if (vex_traceflags & VEX_TRACE_FE)  \
    224       vex_printf(format, ## args)
    225 
    226 #define DIS(buf, format, args...)      \
    227    if (vex_traceflags & VEX_TRACE_FE)  \
    228       vex_sprintf(buf, format, ## args)
    229 
    230 
    231 /*------------------------------------------------------------*/
    232 /*--- Offsets of various parts of the x86 guest state.     ---*/
    233 /*------------------------------------------------------------*/
    234 
    235 #define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
    236 #define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
    237 #define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
    238 #define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
    239 #define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
    240 #define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
    241 #define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
    242 #define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
    243 
    244 #define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
    245 
    246 #define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
    247 #define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
    248 #define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
    249 #define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
    250 
    251 #define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
    252 #define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
    253 #define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
    254 #define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
    255 #define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
    256 #define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
    257 #define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
    258 #define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
    259 
    260 #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
    261 #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
    262 #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
    263 #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
    264 #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
    265 #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
    266 #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
    267 #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
    268 
    269 #define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
    270 #define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
    271 #define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
    272 #define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
    273 #define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
    274 #define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
    275 #define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
    276 #define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
    277 #define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
    278 
    279 #define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
    280 
    281 #define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
    282 #define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
    283 #define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
    284 
    285 #define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
    286 
    287 
    288 /*------------------------------------------------------------*/
    289 /*--- Helper bits and pieces for deconstructing the        ---*/
    290 /*--- x86 insn stream.                                     ---*/
    291 /*------------------------------------------------------------*/
    292 
    293 /* This is the Intel register encoding -- integer regs. */
    294 #define R_EAX 0
    295 #define R_ECX 1
    296 #define R_EDX 2
    297 #define R_EBX 3
    298 #define R_ESP 4
    299 #define R_EBP 5
    300 #define R_ESI 6
    301 #define R_EDI 7
    302 
    303 #define R_AL (0+R_EAX)
    304 #define R_AH (4+R_EAX)
    305 
    306 /* This is the Intel register encoding -- segment regs. */
    307 #define R_ES 0
    308 #define R_CS 1
    309 #define R_SS 2
    310 #define R_DS 3
    311 #define R_FS 4
    312 #define R_GS 5
    313 
    314 
    315 /* Add a statement to the list held by "irbb". */
    316 static void stmt ( IRStmt* st )
    317 {
    318    addStmtToIRSB( irsb, st );
    319 }
    320 
    321 /* Generate a new temporary of the given type. */
    322 static IRTemp newTemp ( IRType ty )
    323 {
    324    vassert(isPlausibleIRType(ty));
    325    return newIRTemp( irsb->tyenv, ty );
    326 }
    327 
    328 /* Various simple conversions */
    329 
    330 static UInt extend_s_8to32( UInt x )
    331 {
    332    return (UInt)((((Int)x) << 24) >> 24);
    333 }
    334 
    335 static UInt extend_s_16to32 ( UInt x )
    336 {
    337    return (UInt)((((Int)x) << 16) >> 16);
    338 }
    339 
    340 /* Fetch a byte from the guest insn stream. */
    341 static UChar getIByte ( Int delta )
    342 {
    343    return guest_code[delta];
    344 }
    345 
    346 /* Extract the reg field from a modRM byte. */
    347 static Int gregOfRM ( UChar mod_reg_rm )
    348 {
    349    return (Int)( (mod_reg_rm >> 3) & 7 );
    350 }
    351 
    352 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    353    register or memory.  If so, the byte will have the form 11XXXYYY,
    354    where YYY is the register number. */
    355 static Bool epartIsReg ( UChar mod_reg_rm )
    356 {
    357    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    358 }
    359 
    360 /* ... and extract the register number ... */
    361 static Int eregOfRM ( UChar mod_reg_rm )
    362 {
    363    return (Int)(mod_reg_rm & 0x7);
    364 }
    365 
    366 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    367 
    368 static UChar getUChar ( Int delta )
    369 {
    370    UChar v = guest_code[delta+0];
    371    return toUChar(v);
    372 }
    373 
    374 static UInt getUDisp16 ( Int delta )
    375 {
    376    UInt v = guest_code[delta+1]; v <<= 8;
    377    v |= guest_code[delta+0];
    378    return v & 0xFFFF;
    379 }
    380 
    381 static UInt getUDisp32 ( Int delta )
    382 {
    383    UInt v = guest_code[delta+3]; v <<= 8;
    384    v |= guest_code[delta+2]; v <<= 8;
    385    v |= guest_code[delta+1]; v <<= 8;
    386    v |= guest_code[delta+0];
    387    return v;
    388 }
    389 
    390 static UInt getUDisp ( Int size, Int delta )
    391 {
    392    switch (size) {
    393       case 4: return getUDisp32(delta);
    394       case 2: return getUDisp16(delta);
    395       case 1: return (UInt)getUChar(delta);
    396       default: vpanic("getUDisp(x86)");
    397    }
    398    return 0; /*notreached*/
    399 }
    400 
    401 
    402 /* Get a byte value out of the insn stream and sign-extend to 32
    403    bits. */
    404 static UInt getSDisp8 ( Int delta )
    405 {
    406    return extend_s_8to32( (UInt) (guest_code[delta]) );
    407 }
    408 
    409 static UInt getSDisp16 ( Int delta0 )
    410 {
    411    UChar* eip = (UChar*)(&guest_code[delta0]);
    412    UInt d = *eip++;
    413    d |= ((*eip++) << 8);
    414    return extend_s_16to32(d);
    415 }
    416 
    417 static UInt getSDisp ( Int size, Int delta )
    418 {
    419    switch (size) {
    420       case 4: return getUDisp32(delta);
    421       case 2: return getSDisp16(delta);
    422       case 1: return getSDisp8(delta);
    423       default: vpanic("getSDisp(x86)");
    424   }
    425   return 0; /*notreached*/
    426 }
    427 
    428 
    429 /*------------------------------------------------------------*/
    430 /*--- Helpers for constructing IR.                         ---*/
    431 /*------------------------------------------------------------*/
    432 
    433 /* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
    434    register references, we need to take the host endianness into
    435    account.  Supplied value is 0 .. 7 and in the Intel instruction
    436    encoding. */
    437 
    438 static IRType szToITy ( Int n )
    439 {
    440    switch (n) {
    441       case 1: return Ity_I8;
    442       case 2: return Ity_I16;
    443       case 4: return Ity_I32;
    444       default: vpanic("szToITy(x86)");
    445    }
    446 }
    447 
    448 /* On a little-endian host, less significant bits of the guest
    449    registers are at lower addresses.  Therefore, if a reference to a
    450    register low half has the safe guest state offset as a reference to
    451    the full register.
    452 */
    453 static Int integerGuestRegOffset ( Int sz, UInt archreg )
    454 {
    455    vassert(archreg < 8);
    456 
    457    /* Correct for little-endian host only. */
    458    vassert(!host_is_bigendian);
    459 
    460    if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
    461       switch (archreg) {
    462          case R_EAX: return OFFB_EAX;
    463          case R_EBX: return OFFB_EBX;
    464          case R_ECX: return OFFB_ECX;
    465          case R_EDX: return OFFB_EDX;
    466          case R_ESI: return OFFB_ESI;
    467          case R_EDI: return OFFB_EDI;
    468          case R_ESP: return OFFB_ESP;
    469          case R_EBP: return OFFB_EBP;
    470          default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
    471       }
    472    }
    473 
    474    vassert(archreg >= 4 && archreg < 8 && sz == 1);
    475    switch (archreg-4) {
    476       case R_EAX: return 1+ OFFB_EAX;
    477       case R_EBX: return 1+ OFFB_EBX;
    478       case R_ECX: return 1+ OFFB_ECX;
    479       case R_EDX: return 1+ OFFB_EDX;
    480       default: vpanic("integerGuestRegOffset(x86,le)(1h)");
    481    }
    482 
    483    /* NOTREACHED */
    484    vpanic("integerGuestRegOffset(x86,le)");
    485 }
    486 
    487 static Int segmentGuestRegOffset ( UInt sreg )
    488 {
    489    switch (sreg) {
    490       case R_ES: return OFFB_ES;
    491       case R_CS: return OFFB_CS;
    492       case R_SS: return OFFB_SS;
    493       case R_DS: return OFFB_DS;
    494       case R_FS: return OFFB_FS;
    495       case R_GS: return OFFB_GS;
    496       default: vpanic("segmentGuestRegOffset(x86)");
    497    }
    498 }
    499 
    500 static Int xmmGuestRegOffset ( UInt xmmreg )
    501 {
    502    switch (xmmreg) {
    503       case 0: return OFFB_XMM0;
    504       case 1: return OFFB_XMM1;
    505       case 2: return OFFB_XMM2;
    506       case 3: return OFFB_XMM3;
    507       case 4: return OFFB_XMM4;
    508       case 5: return OFFB_XMM5;
    509       case 6: return OFFB_XMM6;
    510       case 7: return OFFB_XMM7;
    511       default: vpanic("xmmGuestRegOffset");
    512    }
    513 }
    514 
    515 /* Lanes of vector registers are always numbered from zero being the
    516    least significant lane (rightmost in the register).  */
    517 
    518 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
    519 {
    520    /* Correct for little-endian host only. */
    521    vassert(!host_is_bigendian);
    522    vassert(laneno >= 0 && laneno < 8);
    523    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
    524 }
    525 
    526 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
    527 {
    528    /* Correct for little-endian host only. */
    529    vassert(!host_is_bigendian);
    530    vassert(laneno >= 0 && laneno < 4);
    531    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
    532 }
    533 
    534 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
    535 {
    536    /* Correct for little-endian host only. */
    537    vassert(!host_is_bigendian);
    538    vassert(laneno >= 0 && laneno < 2);
    539    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
    540 }
    541 
    542 static IRExpr* getIReg ( Int sz, UInt archreg )
    543 {
    544    vassert(sz == 1 || sz == 2 || sz == 4);
    545    vassert(archreg < 8);
    546    return IRExpr_Get( integerGuestRegOffset(sz,archreg),
    547                       szToITy(sz) );
    548 }
    549 
    550 /* Ditto, but write to a reg instead. */
    551 static void putIReg ( Int sz, UInt archreg, IRExpr* e )
    552 {
    553    IRType ty = typeOfIRExpr(irsb->tyenv, e);
    554    switch (sz) {
    555       case 1: vassert(ty == Ity_I8); break;
    556       case 2: vassert(ty == Ity_I16); break;
    557       case 4: vassert(ty == Ity_I32); break;
    558       default: vpanic("putIReg(x86)");
    559    }
    560    vassert(archreg < 8);
    561    stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
    562 }
    563 
    564 static IRExpr* getSReg ( UInt sreg )
    565 {
    566    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
    567 }
    568 
    569 static void putSReg ( UInt sreg, IRExpr* e )
    570 {
    571    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    572    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
    573 }
    574 
    575 static IRExpr* getXMMReg ( UInt xmmreg )
    576 {
    577    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
    578 }
    579 
    580 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
    581 {
    582    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
    583 }
    584 
    585 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
    586 {
    587    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
    588 }
    589 
    590 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
    591 {
    592    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
    593 }
    594 
    595 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
    596 {
    597    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
    598 }
    599 
    600 static void putXMMReg ( UInt xmmreg, IRExpr* e )
    601 {
    602    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
    603    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
    604 }
    605 
    606 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
    607 {
    608    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
    609    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    610 }
    611 
    612 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
    613 {
    614    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
    615    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
    616 }
    617 
    618 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
    619 {
    620    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
    621    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    622 }
    623 
    624 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
    625 {
    626    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
    627    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
    628 }
    629 
    630 static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
    631 {
    632    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
    633    stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
    634 }
    635 
    636 static void assign ( IRTemp dst, IRExpr* e )
    637 {
    638    stmt( IRStmt_WrTmp(dst, e) );
    639 }
    640 
    641 static void storeLE ( IRExpr* addr, IRExpr* data )
    642 {
    643    stmt( IRStmt_Store(Iend_LE, addr, data) );
    644 }
    645 
    646 static IRExpr* unop ( IROp op, IRExpr* a )
    647 {
    648    return IRExpr_Unop(op, a);
    649 }
    650 
    651 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    652 {
    653    return IRExpr_Binop(op, a1, a2);
    654 }
    655 
    656 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    657 {
    658    return IRExpr_Triop(op, a1, a2, a3);
    659 }
    660 
    661 static IRExpr* mkexpr ( IRTemp tmp )
    662 {
    663    return IRExpr_RdTmp(tmp);
    664 }
    665 
    666 static IRExpr* mkU8 ( UInt i )
    667 {
    668    vassert(i < 256);
    669    return IRExpr_Const(IRConst_U8( (UChar)i ));
    670 }
    671 
    672 static IRExpr* mkU16 ( UInt i )
    673 {
    674    vassert(i < 65536);
    675    return IRExpr_Const(IRConst_U16( (UShort)i ));
    676 }
    677 
    678 static IRExpr* mkU32 ( UInt i )
    679 {
    680    return IRExpr_Const(IRConst_U32(i));
    681 }
    682 
    683 static IRExpr* mkU64 ( ULong i )
    684 {
    685    return IRExpr_Const(IRConst_U64(i));
    686 }
    687 
    688 static IRExpr* mkU ( IRType ty, UInt i )
    689 {
    690    if (ty == Ity_I8)  return mkU8(i);
    691    if (ty == Ity_I16) return mkU16(i);
    692    if (ty == Ity_I32) return mkU32(i);
    693    /* If this panics, it usually means you passed a size (1,2,4)
    694       value as the IRType, rather than a real IRType. */
    695    vpanic("mkU(x86)");
    696 }
    697 
    698 static IRExpr* mkV128 ( UShort mask )
    699 {
    700    return IRExpr_Const(IRConst_V128(mask));
    701 }
    702 
    703 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    704 {
    705    return IRExpr_Load(Iend_LE, ty, addr);
    706 }
    707 
    708 static IROp mkSizedOp ( IRType ty, IROp op8 )
    709 {
    710    Int adj;
    711    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    712    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    713            || op8 == Iop_Mul8
    714            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    715            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    716            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    717            || op8 == Iop_CasCmpNE8
    718            || op8 == Iop_Not8);
    719    adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    720    return adj + op8;
    721 }
    722 
    723 static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
    724 {
    725    if (szSmall == 1 && szBig == 4) {
    726       return signd ? Iop_8Sto32 : Iop_8Uto32;
    727    }
    728    if (szSmall == 1 && szBig == 2) {
    729       return signd ? Iop_8Sto16 : Iop_8Uto16;
    730    }
    731    if (szSmall == 2 && szBig == 4) {
    732       return signd ? Iop_16Sto32 : Iop_16Uto32;
    733    }
    734    vpanic("mkWidenOp(x86,guest)");
    735 }
    736 
    737 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
    738 {
    739    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
    740    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
    741    return unop(Iop_32to1,
    742                binop(Iop_And32,
    743                      unop(Iop_1Uto32,x),
    744                      unop(Iop_1Uto32,y)));
    745 }
    746 
    747 /* Generate a compare-and-swap operation, operating on memory at
    748    'addr'.  The expected value is 'expVal' and the new value is
    749    'newVal'.  If the operation fails, then transfer control (with a
    750    no-redir jump (XXX no -- see comment at top of this file)) to
    751    'restart_point', which is presumably the address of the guest
    752    instruction again -- retrying, essentially. */
    753 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
    754                     Addr32 restart_point )
    755 {
    756    IRCAS* cas;
    757    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
    758    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
    759    IRTemp oldTmp = newTemp(tyE);
    760    IRTemp expTmp = newTemp(tyE);
    761    vassert(tyE == tyN);
    762    vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
    763    assign(expTmp, expVal);
    764    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
    765                   NULL, mkexpr(expTmp), NULL, newVal );
    766    stmt( IRStmt_CAS(cas) );
    767    stmt( IRStmt_Exit(
    768             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
    769                    mkexpr(oldTmp), mkexpr(expTmp) ),
    770             Ijk_Boring, /*Ijk_NoRedir*/
    771             IRConst_U32( restart_point )
    772          ));
    773 }
    774 
    775 
    776 /*------------------------------------------------------------*/
    777 /*--- Helpers for %eflags.                                 ---*/
    778 /*------------------------------------------------------------*/
    779 
    780 /* -------------- Evaluating the flags-thunk. -------------- */
    781 
    782 /* Build IR to calculate all the eflags from stored
    783    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    784    Ity_I32. */
    785 static IRExpr* mk_x86g_calculate_eflags_all ( void )
    786 {
    787    IRExpr** args
    788       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    789                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    790                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    791                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    792    IRExpr* call
    793       = mkIRExprCCall(
    794            Ity_I32,
    795            0/*regparm*/,
    796            "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
    797            args
    798         );
    799    /* Exclude OP and NDEP from definedness checking.  We're only
    800       interested in DEP1 and DEP2. */
    801    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    802    return call;
    803 }
    804 
    805 /* Build IR to calculate some particular condition from stored
    806    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
    807    Ity_Bit. */
    808 static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
    809 {
    810    IRExpr** args
    811       = mkIRExprVec_5( mkU32(cond),
    812                        IRExpr_Get(OFFB_CC_OP,  Ity_I32),
    813                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    814                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    815                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    816    IRExpr* call
    817       = mkIRExprCCall(
    818            Ity_I32,
    819            0/*regparm*/,
    820            "x86g_calculate_condition", &x86g_calculate_condition,
    821            args
    822         );
    823    /* Exclude the requested condition, OP and NDEP from definedness
    824       checking.  We're only interested in DEP1 and DEP2. */
    825    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
    826    return unop(Iop_32to1, call);
    827 }
    828 
    829 /* Build IR to calculate just the carry flag from stored
    830    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
    831 static IRExpr* mk_x86g_calculate_eflags_c ( void )
    832 {
    833    IRExpr** args
    834       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
    835                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
    836                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
    837                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
    838    IRExpr* call
    839       = mkIRExprCCall(
    840            Ity_I32,
    841            3/*regparm*/,
    842            "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
    843            args
    844         );
    845    /* Exclude OP and NDEP from definedness checking.  We're only
    846       interested in DEP1 and DEP2. */
    847    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
    848    return call;
    849 }
    850 
    851 
    852 /* -------------- Building the flags-thunk. -------------- */
    853 
    854 /* The machinery in this section builds the flag-thunk following a
    855    flag-setting operation.  Hence the various setFlags_* functions.
    856 */
    857 
    858 static Bool isAddSub ( IROp op8 )
    859 {
    860    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
    861 }
    862 
    863 static Bool isLogic ( IROp op8 )
    864 {
    865    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
    866 }
    867 
    868 /* U-widen 8/16/32 bit int expr to 32. */
    869 static IRExpr* widenUto32 ( IRExpr* e )
    870 {
    871    switch (typeOfIRExpr(irsb->tyenv,e)) {
    872       case Ity_I32: return e;
    873       case Ity_I16: return unop(Iop_16Uto32,e);
    874       case Ity_I8:  return unop(Iop_8Uto32,e);
    875       default: vpanic("widenUto32");
    876    }
    877 }
    878 
    879 /* S-widen 8/16/32 bit int expr to 32. */
    880 static IRExpr* widenSto32 ( IRExpr* e )
    881 {
    882    switch (typeOfIRExpr(irsb->tyenv,e)) {
    883       case Ity_I32: return e;
    884       case Ity_I16: return unop(Iop_16Sto32,e);
    885       case Ity_I8:  return unop(Iop_8Sto32,e);
    886       default: vpanic("widenSto32");
    887    }
    888 }
    889 
    890 /* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
    891    of these combinations make sense. */
    892 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
    893 {
    894    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
    895    if (src_ty == dst_ty)
    896       return e;
    897    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
    898       return unop(Iop_32to16, e);
    899    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
    900       return unop(Iop_32to8, e);
    901 
    902    vex_printf("\nsrc, dst tys are: ");
    903    ppIRType(src_ty);
    904    vex_printf(", ");
    905    ppIRType(dst_ty);
    906    vex_printf("\n");
    907    vpanic("narrowTo(x86)");
    908 }
    909 
    910 
    911 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
    912    auto-sized up to the real op. */
    913 
    914 static
    915 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
    916 {
    917    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    918 
    919    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    920 
    921    switch (op8) {
    922       case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
    923       case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
    924       default:       ppIROp(op8);
    925                      vpanic("setFlags_DEP1_DEP2(x86)");
    926    }
    927    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    928    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    929    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
    930    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    931       elimination of previous stores to this field work better. */
    932    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    933 }
    934 
    935 
    936 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
    937 
    938 static
    939 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
    940 {
    941    Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
    942 
    943    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    944 
    945    switch (op8) {
    946       case Iop_Or8:
    947       case Iop_And8:
    948       case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
    949       default:       ppIROp(op8);
    950                      vpanic("setFlags_DEP1(x86)");
    951    }
    952    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
    953    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
    954    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
    955    /* Set NDEP even though it isn't used.  This makes redundant-PUT
    956       elimination of previous stores to this field work better. */
    957    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
    958 }
    959 
    960 
    961 /* For shift operations, we put in the result and the undershifted
    962    result.  Except if the shift amount is zero, the thunk is left
    963    unchanged. */
    964 
    965 static void setFlags_DEP1_DEP2_shift ( IROp    op32,
    966                                        IRTemp  res,
    967                                        IRTemp  resUS,
    968                                        IRType  ty,
    969                                        IRTemp  guard )
    970 {
    971    Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
    972 
    973    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
    974    vassert(guard);
    975 
    976    /* Both kinds of right shifts are handled by the same thunk
    977       operation. */
    978    switch (op32) {
    979       case Iop_Shr32:
    980       case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
    981       case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
    982       default:        ppIROp(op32);
    983                       vpanic("setFlags_DEP1_DEP2_shift(x86)");
    984    }
    985 
    986    /* DEP1 contains the result, DEP2 contains the undershifted value. */
    987    stmt( IRStmt_Put( OFFB_CC_OP,
    988                      IRExpr_Mux0X( mkexpr(guard),
    989                                    IRExpr_Get(OFFB_CC_OP,Ity_I32),
    990                                    mkU32(ccOp))) );
    991    stmt( IRStmt_Put( OFFB_CC_DEP1,
    992                      IRExpr_Mux0X( mkexpr(guard),
    993                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
    994                                    widenUto32(mkexpr(res)))) );
    995    stmt( IRStmt_Put( OFFB_CC_DEP2,
    996                      IRExpr_Mux0X( mkexpr(guard),
    997                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
    998                                    widenUto32(mkexpr(resUS)))) );
    999    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1000       elimination of previous stores to this field work better. */
   1001    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1002 }
   1003 
   1004 
   1005 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1006    the former value of the carry flag, which unfortunately we have to
   1007    compute. */
   1008 
   1009 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1010 {
   1011    Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
   1012 
   1013    ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   1014    vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
   1015 
   1016    /* This has to come first, because calculating the C flag
   1017       may require reading all four thunk fields. */
   1018    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
   1019    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
   1020    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
   1021    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
   1022 }
   1023 
   1024 
   1025 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1026    two arguments. */
   1027 
   1028 static
   1029 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
   1030 {
   1031    switch (ty) {
   1032       case Ity_I8:
   1033          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
   1034          break;
   1035       case Ity_I16:
   1036          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
   1037          break;
   1038       case Ity_I32:
   1039          stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
   1040          break;
   1041       default:
   1042          vpanic("setFlags_MUL(x86)");
   1043    }
   1044    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
   1045    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
   1046    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   1047       elimination of previous stores to this field work better. */
   1048    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   1049 }
   1050 
   1051 
   1052 /* -------------- Condition codes. -------------- */
   1053 
   1054 /* Condition codes, using the Intel encoding.  */
   1055 
   1056 static HChar* name_X86Condcode ( X86Condcode cond )
   1057 {
   1058    switch (cond) {
   1059       case X86CondO:      return "o";
   1060       case X86CondNO:     return "no";
   1061       case X86CondB:      return "b";
   1062       case X86CondNB:     return "nb";
   1063       case X86CondZ:      return "z";
   1064       case X86CondNZ:     return "nz";
   1065       case X86CondBE:     return "be";
   1066       case X86CondNBE:    return "nbe";
   1067       case X86CondS:      return "s";
   1068       case X86CondNS:     return "ns";
   1069       case X86CondP:      return "p";
   1070       case X86CondNP:     return "np";
   1071       case X86CondL:      return "l";
   1072       case X86CondNL:     return "nl";
   1073       case X86CondLE:     return "le";
   1074       case X86CondNLE:    return "nle";
   1075       case X86CondAlways: return "ALWAYS";
   1076       default: vpanic("name_X86Condcode");
   1077    }
   1078 }
   1079 
   1080 static
   1081 X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
   1082                                       Bool*        needInvert )
   1083 {
   1084    vassert(cond >= X86CondO && cond <= X86CondNLE);
   1085    if (cond & 1) {
   1086       *needInvert = True;
   1087       return cond-1;
   1088    } else {
   1089       *needInvert = False;
   1090       return cond;
   1091    }
   1092 }
   1093 
   1094 
   1095 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1096 
   1097 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1098    appropriately.
   1099 
   1100    Optionally, generate a store for the 'tres' value.  This can either
   1101    be a normal store, or it can be a cas-with-possible-failure style
   1102    store:
   1103 
   1104    if taddr is IRTemp_INVALID, then no store is generated.
   1105 
   1106    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1107    the address) is generated:
   1108 
   1109      if texpVal is IRTemp_INVALID then a normal store is
   1110      generated, and restart_point must be zero (it is irrelevant).
   1111 
   1112      if texpVal is not IRTemp_INVALID then a cas-style store is
   1113      generated.  texpVal is the expected value, restart_point
   1114      is the restart point if the store fails, and texpVal must
   1115      have the same type as tres.
   1116 */
   1117 static void helper_ADC ( Int sz,
   1118                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1119                          /* info about optional store: */
   1120                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1121 {
   1122    UInt    thunkOp;
   1123    IRType  ty    = szToITy(sz);
   1124    IRTemp  oldc  = newTemp(Ity_I32);
   1125    IRTemp  oldcn = newTemp(ty);
   1126    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   1127    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1128 
   1129    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1130    vassert(sz == 1 || sz == 2 || sz == 4);
   1131    thunkOp = sz==4 ? X86G_CC_OP_ADCL
   1132                    : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
   1133 
   1134    /* oldc = old carry flag, 0 or 1 */
   1135    assign( oldc,  binop(Iop_And32,
   1136                         mk_x86g_calculate_eflags_c(),
   1137                         mkU32(1)) );
   1138 
   1139    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1140 
   1141    assign( tres, binop(plus,
   1142                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   1143                        mkexpr(oldcn)) );
   1144 
   1145    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1146       start of this function. */
   1147    if (taddr != IRTemp_INVALID) {
   1148       if (texpVal == IRTemp_INVALID) {
   1149          vassert(restart_point == 0);
   1150          storeLE( mkexpr(taddr), mkexpr(tres) );
   1151       } else {
   1152          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1153          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1154          casLE( mkexpr(taddr),
   1155                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1156       }
   1157    }
   1158 
   1159    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1160    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
   1161    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1162                                                          mkexpr(oldcn)) )) );
   1163    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1164 }
   1165 
   1166 
   1167 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   1168    appropriately.  As with helper_ADC, possibly generate a store of
   1169    the result -- see comments on helper_ADC for details.
   1170 */
   1171 static void helper_SBB ( Int sz,
   1172                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   1173                          /* info about optional store: */
   1174                          IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
   1175 {
   1176    UInt    thunkOp;
   1177    IRType  ty    = szToITy(sz);
   1178    IRTemp  oldc  = newTemp(Ity_I32);
   1179    IRTemp  oldcn = newTemp(ty);
   1180    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   1181    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   1182 
   1183    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   1184    vassert(sz == 1 || sz == 2 || sz == 4);
   1185    thunkOp = sz==4 ? X86G_CC_OP_SBBL
   1186                    : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
   1187 
   1188    /* oldc = old carry flag, 0 or 1 */
   1189    assign( oldc, binop(Iop_And32,
   1190                        mk_x86g_calculate_eflags_c(),
   1191                        mkU32(1)) );
   1192 
   1193    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   1194 
   1195    assign( tres, binop(minus,
   1196                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   1197                        mkexpr(oldcn)) );
   1198 
   1199    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   1200       start of this function. */
   1201    if (taddr != IRTemp_INVALID) {
   1202       if (texpVal == IRTemp_INVALID) {
   1203          vassert(restart_point == 0);
   1204          storeLE( mkexpr(taddr), mkexpr(tres) );
   1205       } else {
   1206          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   1207          /* .. and hence 'texpVal' has the same type as 'tres'. */
   1208          casLE( mkexpr(taddr),
   1209                 mkexpr(texpVal), mkexpr(tres), restart_point );
   1210       }
   1211    }
   1212 
   1213    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
   1214    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
   1215    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
   1216                                                          mkexpr(oldcn)) )) );
   1217    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   1218 }
   1219 
   1220 
   1221 /* -------------- Helpers for disassembly printing. -------------- */
   1222 
   1223 static HChar* nameGrp1 ( Int opc_aux )
   1224 {
   1225    static HChar* grp1_names[8]
   1226      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   1227    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
   1228    return grp1_names[opc_aux];
   1229 }
   1230 
   1231 static HChar* nameGrp2 ( Int opc_aux )
   1232 {
   1233    static HChar* grp2_names[8]
   1234      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   1235    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
   1236    return grp2_names[opc_aux];
   1237 }
   1238 
   1239 static HChar* nameGrp4 ( Int opc_aux )
   1240 {
   1241    static HChar* grp4_names[8]
   1242      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   1243    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
   1244    return grp4_names[opc_aux];
   1245 }
   1246 
   1247 static HChar* nameGrp5 ( Int opc_aux )
   1248 {
   1249    static HChar* grp5_names[8]
   1250      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   1251    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
   1252    return grp5_names[opc_aux];
   1253 }
   1254 
   1255 static HChar* nameGrp8 ( Int opc_aux )
   1256 {
   1257    static HChar* grp8_names[8]
   1258      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   1259    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
   1260    return grp8_names[opc_aux];
   1261 }
   1262 
   1263 static HChar* nameIReg ( Int size, Int reg )
   1264 {
   1265    static HChar* ireg32_names[8]
   1266      = { "%eax", "%ecx", "%edx", "%ebx",
   1267          "%esp", "%ebp", "%esi", "%edi" };
   1268    static HChar* ireg16_names[8]
   1269      = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
   1270    static HChar* ireg8_names[8]
   1271      = { "%al", "%cl", "%dl", "%bl",
   1272          "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
   1273    if (reg < 0 || reg > 7) goto bad;
   1274    switch (size) {
   1275       case 4: return ireg32_names[reg];
   1276       case 2: return ireg16_names[reg];
   1277       case 1: return ireg8_names[reg];
   1278    }
   1279   bad:
   1280    vpanic("nameIReg(X86)");
   1281    return NULL; /*notreached*/
   1282 }
   1283 
   1284 static HChar* nameSReg ( UInt sreg )
   1285 {
   1286    switch (sreg) {
   1287       case R_ES: return "%es";
   1288       case R_CS: return "%cs";
   1289       case R_SS: return "%ss";
   1290       case R_DS: return "%ds";
   1291       case R_FS: return "%fs";
   1292       case R_GS: return "%gs";
   1293       default: vpanic("nameSReg(x86)");
   1294    }
   1295 }
   1296 
   1297 static HChar* nameMMXReg ( Int mmxreg )
   1298 {
   1299    static HChar* mmx_names[8]
   1300      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   1301    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
   1302    return mmx_names[mmxreg];
   1303 }
   1304 
   1305 static HChar* nameXMMReg ( Int xmmreg )
   1306 {
   1307    static HChar* xmm_names[8]
   1308      = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
   1309          "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
   1310    if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
   1311    return xmm_names[xmmreg];
   1312 }
   1313 
   1314 static HChar* nameMMXGran ( Int gran )
   1315 {
   1316    switch (gran) {
   1317       case 0: return "b";
   1318       case 1: return "w";
   1319       case 2: return "d";
   1320       case 3: return "q";
   1321       default: vpanic("nameMMXGran(x86,guest)");
   1322    }
   1323 }
   1324 
   1325 static HChar nameISize ( Int size )
   1326 {
   1327    switch (size) {
   1328       case 4: return 'l';
   1329       case 2: return 'w';
   1330       case 1: return 'b';
   1331       default: vpanic("nameISize(x86)");
   1332    }
   1333 }
   1334 
   1335 
   1336 /*------------------------------------------------------------*/
   1337 /*--- JMP helpers                                          ---*/
   1338 /*------------------------------------------------------------*/
   1339 
   1340 static void jmp_lit( IRJumpKind kind, Addr32 d32 )
   1341 {
   1342    irsb->next     = mkU32(d32);
   1343    irsb->jumpkind = kind;
   1344 }
   1345 
   1346 static void jmp_treg( IRJumpKind kind, IRTemp t )
   1347 {
   1348    irsb->next = mkexpr(t);
   1349    irsb->jumpkind = kind;
   1350 }
   1351 
   1352 static
   1353 void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
   1354 {
   1355    Bool        invert;
   1356    X86Condcode condPos;
   1357    condPos = positiveIse_X86Condcode ( cond, &invert );
   1358    if (invert) {
   1359       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1360                          Ijk_Boring,
   1361                          IRConst_U32(d32_false) ) );
   1362       irsb->next     = mkU32(d32_true);
   1363       irsb->jumpkind = Ijk_Boring;
   1364    } else {
   1365       stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
   1366                          Ijk_Boring,
   1367                          IRConst_U32(d32_true) ) );
   1368       irsb->next     = mkU32(d32_false);
   1369       irsb->jumpkind = Ijk_Boring;
   1370    }
   1371 }
   1372 
   1373 
   1374 /*------------------------------------------------------------*/
   1375 /*--- Disassembling addressing modes                       ---*/
   1376 /*------------------------------------------------------------*/
   1377 
   1378 static
   1379 HChar* sorbTxt ( UChar sorb )
   1380 {
   1381    switch (sorb) {
   1382       case 0:    return ""; /* no override */
   1383       case 0x3E: return "%ds";
   1384       case 0x26: return "%es:";
   1385       case 0x64: return "%fs:";
   1386       case 0x65: return "%gs:";
   1387       default: vpanic("sorbTxt(x86,guest)");
   1388    }
   1389 }
   1390 
   1391 
   1392 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   1393    linear address by adding any required segment override as indicated
   1394    by sorb. */
   1395 static
   1396 IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
   1397 {
   1398    Int    sreg;
   1399    IRType hWordTy;
   1400    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   1401 
   1402    if (sorb == 0)
   1403       /* the common case - no override */
   1404       return virtual;
   1405 
   1406    switch (sorb) {
   1407       case 0x3E: sreg = R_DS; break;
   1408       case 0x26: sreg = R_ES; break;
   1409       case 0x64: sreg = R_FS; break;
   1410       case 0x65: sreg = R_GS; break;
   1411       default: vpanic("handleSegOverride(x86,guest)");
   1412    }
   1413 
   1414    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   1415 
   1416    seg_selector = newTemp(Ity_I32);
   1417    ldt_ptr      = newTemp(hWordTy);
   1418    gdt_ptr      = newTemp(hWordTy);
   1419    r64          = newTemp(Ity_I64);
   1420 
   1421    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   1422    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   1423    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   1424 
   1425    /*
   1426    Call this to do the translation and limit checks:
   1427    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   1428                                  UInt seg_selector, UInt virtual_addr )
   1429    */
   1430    assign(
   1431       r64,
   1432       mkIRExprCCall(
   1433          Ity_I64,
   1434          0/*regparms*/,
   1435          "x86g_use_seg_selector",
   1436          &x86g_use_seg_selector,
   1437          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   1438                         mkexpr(seg_selector), virtual)
   1439       )
   1440    );
   1441 
   1442    /* If the high 32 of the result are non-zero, there was a
   1443       failure in address translation.  In which case, make a
   1444       quick exit.
   1445    */
   1446    stmt(
   1447       IRStmt_Exit(
   1448          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   1449          Ijk_MapFail,
   1450          IRConst_U32( guest_EIP_curr_instr )
   1451       )
   1452    );
   1453 
   1454    /* otherwise, here's the translated result. */
   1455    return unop(Iop_64to32, mkexpr(r64));
   1456 }
   1457 
   1458 
   1459 /* Generate IR to calculate an address indicated by a ModRM and
   1460    following SIB bytes.  The expression, and the number of bytes in
   1461    the address mode, are returned.  Note that this fn should not be
   1462    called if the R/M part of the address denotes a register instead of
   1463    memory.  If print_codegen is true, text of the addressing mode is
   1464    placed in buf.
   1465 
   1466    The computed address is stored in a new tempreg, and the
   1467    identity of the tempreg is returned.  */
   1468 
   1469 static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
   1470 {
   1471    IRTemp tmp = newTemp(Ity_I32);
   1472    assign( tmp, addr32 );
   1473    return tmp;
   1474 }
   1475 
   1476 static
   1477 IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
   1478 {
   1479    UChar mod_reg_rm = getIByte(delta);
   1480    delta++;
   1481 
   1482    buf[0] = (UChar)0;
   1483 
   1484    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1485       jump table seems a bit excessive.
   1486    */
   1487    mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
   1488    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1489                                             /* is now XX0XXYYY */
   1490    mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
   1491    switch (mod_reg_rm) {
   1492 
   1493       /* (%eax) .. (%edi), not including (%esp) or (%ebp).
   1494          --> GET %reg, t
   1495       */
   1496       case 0x00: case 0x01: case 0x02: case 0x03:
   1497       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1498          { UChar rm = mod_reg_rm;
   1499            DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
   1500            *len = 1;
   1501            return disAMode_copy2tmp(
   1502                   handleSegOverride(sorb, getIReg(4,rm)));
   1503          }
   1504 
   1505       /* d8(%eax) ... d8(%edi), not including d8(%esp)
   1506          --> GET %reg, t ; ADDL d8, t
   1507       */
   1508       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1509       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1510          { UChar rm = toUChar(mod_reg_rm & 7);
   1511            UInt  d  = getSDisp8(delta);
   1512            DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1513            *len = 2;
   1514            return disAMode_copy2tmp(
   1515                   handleSegOverride(sorb,
   1516                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1517          }
   1518 
   1519       /* d32(%eax) ... d32(%edi), not including d32(%esp)
   1520          --> GET %reg, t ; ADDL d8, t
   1521       */
   1522       case 0x10: case 0x11: case 0x12: case 0x13:
   1523       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1524          { UChar rm = toUChar(mod_reg_rm & 7);
   1525            UInt  d  = getUDisp32(delta);
   1526            DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
   1527            *len = 5;
   1528            return disAMode_copy2tmp(
   1529                   handleSegOverride(sorb,
   1530                      binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
   1531          }
   1532 
   1533       /* a register, %eax .. %edi.  This shouldn't happen. */
   1534       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1535       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1536          vpanic("disAMode(x86): not an addr!");
   1537 
   1538       /* a 32-bit literal address
   1539          --> MOV d32, tmp
   1540       */
   1541       case 0x05:
   1542          { UInt d = getUDisp32(delta);
   1543            *len = 5;
   1544            DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
   1545            return disAMode_copy2tmp(
   1546                      handleSegOverride(sorb, mkU32(d)));
   1547          }
   1548 
   1549       case 0x04: {
   1550          /* SIB, with no displacement.  Special cases:
   1551             -- %esp cannot act as an index value.
   1552                If index_r indicates %esp, zero is used for the index.
   1553             -- when mod is zero and base indicates EBP, base is instead
   1554                a 32-bit literal.
   1555             It's all madness, I tell you.  Extract %index, %base and
   1556             scale from the SIB byte.  The value denoted is then:
   1557                | %index == %ESP && %base == %EBP
   1558                = d32 following SIB byte
   1559                | %index == %ESP && %base != %EBP
   1560                = %base
   1561                | %index != %ESP && %base == %EBP
   1562                = d32 following SIB byte + (%index << scale)
   1563                | %index != %ESP && %base != %ESP
   1564                = %base + (%index << scale)
   1565 
   1566             What happens to the souls of CPU architects who dream up such
   1567             horrendous schemes, do you suppose?
   1568          */
   1569          UChar sib     = getIByte(delta);
   1570          UChar scale   = toUChar((sib >> 6) & 3);
   1571          UChar index_r = toUChar((sib >> 3) & 7);
   1572          UChar base_r  = toUChar(sib & 7);
   1573          delta++;
   1574 
   1575          if (index_r != R_ESP && base_r != R_EBP) {
   1576             DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
   1577                       nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1578             *len = 2;
   1579             return
   1580                disAMode_copy2tmp(
   1581                handleSegOverride(sorb,
   1582                   binop(Iop_Add32,
   1583                         getIReg(4,base_r),
   1584                         binop(Iop_Shl32, getIReg(4,index_r),
   1585                               mkU8(scale)))));
   1586          }
   1587 
   1588          if (index_r != R_ESP && base_r == R_EBP) {
   1589             UInt d = getUDisp32(delta);
   1590             DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
   1591                       nameIReg(4,index_r), 1<<scale);
   1592             *len = 6;
   1593             return
   1594                disAMode_copy2tmp(
   1595                handleSegOverride(sorb,
   1596                   binop(Iop_Add32,
   1597                         binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
   1598                         mkU32(d))));
   1599          }
   1600 
   1601          if (index_r == R_ESP && base_r != R_EBP) {
   1602             DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
   1603             *len = 2;
   1604             return disAMode_copy2tmp(
   1605                    handleSegOverride(sorb, getIReg(4,base_r)));
   1606          }
   1607 
   1608          if (index_r == R_ESP && base_r == R_EBP) {
   1609             UInt d = getUDisp32(delta);
   1610             DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
   1611             *len = 6;
   1612             return disAMode_copy2tmp(
   1613                    handleSegOverride(sorb, mkU32(d)));
   1614          }
   1615          /*NOTREACHED*/
   1616          vassert(0);
   1617       }
   1618 
   1619       /* SIB, with 8-bit displacement.  Special cases:
   1620          -- %esp cannot act as an index value.
   1621             If index_r indicates %esp, zero is used for the index.
   1622          Denoted value is:
   1623             | %index == %ESP
   1624             = d8 + %base
   1625             | %index != %ESP
   1626             = d8 + %base + (%index << scale)
   1627       */
   1628       case 0x0C: {
   1629          UChar sib     = getIByte(delta);
   1630          UChar scale   = toUChar((sib >> 6) & 3);
   1631          UChar index_r = toUChar((sib >> 3) & 7);
   1632          UChar base_r  = toUChar(sib & 7);
   1633          UInt  d       = getSDisp8(delta+1);
   1634 
   1635          if (index_r == R_ESP) {
   1636             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1637                                    (Int)d, nameIReg(4,base_r));
   1638             *len = 3;
   1639             return disAMode_copy2tmp(
   1640                    handleSegOverride(sorb,
   1641                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1642          } else {
   1643             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1644                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1645             *len = 3;
   1646             return
   1647                 disAMode_copy2tmp(
   1648                 handleSegOverride(sorb,
   1649                   binop(Iop_Add32,
   1650                         binop(Iop_Add32,
   1651                               getIReg(4,base_r),
   1652                               binop(Iop_Shl32,
   1653                                     getIReg(4,index_r), mkU8(scale))),
   1654                         mkU32(d))));
   1655          }
   1656 	 /*NOTREACHED*/
   1657          vassert(0);
   1658       }
   1659 
   1660       /* SIB, with 32-bit displacement.  Special cases:
   1661          -- %esp cannot act as an index value.
   1662             If index_r indicates %esp, zero is used for the index.
   1663          Denoted value is:
   1664             | %index == %ESP
   1665             = d32 + %base
   1666             | %index != %ESP
   1667             = d32 + %base + (%index << scale)
   1668       */
   1669       case 0x14: {
   1670          UChar sib     = getIByte(delta);
   1671          UChar scale   = toUChar((sib >> 6) & 3);
   1672          UChar index_r = toUChar((sib >> 3) & 7);
   1673          UChar base_r  = toUChar(sib & 7);
   1674          UInt d        = getUDisp32(delta+1);
   1675 
   1676          if (index_r == R_ESP) {
   1677             DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
   1678                                    (Int)d, nameIReg(4,base_r));
   1679             *len = 6;
   1680             return disAMode_copy2tmp(
   1681                    handleSegOverride(sorb,
   1682                       binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
   1683          } else {
   1684             DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
   1685                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
   1686             *len = 6;
   1687             return
   1688                 disAMode_copy2tmp(
   1689                 handleSegOverride(sorb,
   1690                   binop(Iop_Add32,
   1691                         binop(Iop_Add32,
   1692                               getIReg(4,base_r),
   1693                               binop(Iop_Shl32,
   1694                                     getIReg(4,index_r), mkU8(scale))),
   1695                         mkU32(d))));
   1696          }
   1697 	 /*NOTREACHED*/
   1698          vassert(0);
   1699       }
   1700 
   1701       default:
   1702          vpanic("disAMode(x86)");
   1703          return 0; /*notreached*/
   1704    }
   1705 }
   1706 
   1707 
   1708 /* Figure out the number of (insn-stream) bytes constituting the amode
   1709    beginning at delta.  Is useful for getting hold of literals beyond
   1710    the end of the amode before it has been disassembled.

           delta is the offset of the modRM byte.  The returned count
           includes the modRM byte itself plus any SIB byte and any
           displacement bytes.  Only instruction bytes are read (via
           getIByte); no IR is generated here.  The lengths returned for
           the SIB cases (0x04, 0x0C, 0x14) agree with the *len values
           that disAMode reports for the same encodings.  */
   1711 
   1712 static UInt lengthAMode ( Int delta )
   1713 {
   1714    UChar mod_reg_rm = getIByte(delta); delta++;
           /* delta now refers to the byte after modRM, which is the SIB
              byte when one is present. */
   1715 
   1716    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   1717       jump table seems a bit excessive.
   1718    */
   1719    mod_reg_rm &= 0xC7;               /* is now XX000YYY */
   1720    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   1721                                      /* is now XX0XXYYY */
   1722    mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
           /* XX is the mod field and YYY the rm field, so the value
              switched on is (mod << 3) | rm. */
   1723    switch (mod_reg_rm) {
   1724 
   1725       /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
   1726       case 0x00: case 0x01: case 0x02: case 0x03:
   1727       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   1728          return 1;   /* modRM byte only */
   1729 
   1730       /* d8(%eax) ... d8(%edi), not including d8(%esp). */
   1731       case 0x08: case 0x09: case 0x0A: case 0x0B:
   1732       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   1733          return 2;   /* modRM + 1 displacement byte */
   1734 
   1735       /* d32(%eax) ... d32(%edi), not including d32(%esp). */
   1736       case 0x10: case 0x11: case 0x12: case 0x13:
   1737       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   1738          return 5;   /* modRM + 4 displacement bytes */
   1739 
   1740       /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
   1741       case 0x18: case 0x19: case 0x1A: case 0x1B:
   1742       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   1743          return 1;
   1744 
   1745       /* a 32-bit literal address. */
   1746       case 0x05: return 5;   /* modRM + 4 address bytes */
   1747 
   1748       /* SIB, no displacement.  */
   1749       case 0x04: {
   1750          UChar sib    = getIByte(delta);
   1751          UChar base_r = toUChar(sib & 7);
               /* When the SIB base field names EBP, a 32-bit literal
                  follows the SIB byte: modRM + SIB + 4 bytes.  Otherwise
                  just modRM + SIB. */
   1752          if (base_r == R_EBP) return 6; else return 2;
   1753       }
   1754       /* SIB, with 8-bit displacement.  */
   1755       case 0x0C: return 3;   /* modRM + SIB + 1 displacement byte */
   1756 
   1757       /* SIB, with 32-bit displacement.  */
   1758       case 0x14: return 6;   /* modRM + SIB + 4 displacement bytes */
   1759 
   1760       default:
               /* All 32 values of the 5-bit selector are covered above. */
   1761          vpanic("lengthAMode");
   1762          return 0; /*notreached*/
   1763    }
   1764 }
   1765 
   1766 /*------------------------------------------------------------*/
   1767 /*--- Disassembling common idioms                          ---*/
   1768 /*------------------------------------------------------------*/
   1769 
   1770 /* Handle binary integer instructions of the form
   1771       op E, G  meaning
   1772       op reg-or-mem, reg
   1773    Is passed a ptr to the modRM byte, the actual operation, and the
   1774    data size.  Returns the address advanced completely over this
   1775    instruction.
   1776 
   1777    E(src) is reg-or-mem
   1778    G(dst) is reg.
   1779 
   1780    If E is reg, -->    GET %G,  tmp
   1781                        OP %E,   tmp
   1782                        PUT tmp, %G
   1783 
   1784    If E is mem and OP is not reversible,
   1785                 -->    (getAddr E) -> tmpa
   1786                        LD (tmpa), tmpa
   1787                        GET %G, tmp2
   1788                        OP tmpa, tmp2
   1789                        PUT tmp2, %G
   1790 
   1791    If E is mem and OP is reversible
   1792                 -->    (getAddr E) -> tmpa
   1793                        LD (tmpa), tmpa
   1794                        OP %G, tmpa
   1795                        PUT tmpa, %G

           Parameters:
              sorb        passed through to disAMode when E is memory
                          (disAMode applies it via handleSegOverride).
              addSubCarry True selects add-with-carry (op8 == Iop_Add8)
                          or subtract-with-borrow (op8 == Iop_Sub8);
                          keep must then be True.
              op8         the 8-bit form of the operation; mkSizedOp
                          turns it into the form for the operand type.
              keep        if False the result is not written to G and
                          only the flag-setting calls are made.
              size        operand size, as accepted by szToITy/getIReg
                          (presumably 1, 2 or 4 -- confirm at callers).
              delta0      offset of the modRM byte.
              t_x86opc    mnemonic text, used only in the DIP trace.

           Returns delta0+1 when E is a register, else delta0 plus the
           amode length reported by disAMode.
   1796 */
   1797 static
   1798 UInt dis_op2_E_G ( UChar       sorb,
   1799                    Bool        addSubCarry,
   1800                    IROp        op8,
   1801                    Bool        keep,
   1802                    Int         size,
   1803                    Int         delta0,
   1804                    HChar*      t_x86opc )
   1805 {
   1806    HChar   dis_buf[50];
   1807    Int     len;
   1808    IRType  ty   = szToITy(size);
   1809    IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   1810    IRTemp  src  = newTemp(ty);   /* value of E */
   1811    IRTemp  dst0 = newTemp(ty);   /* value of G before the operation */
           /* NOTE(review): sibling functions read the modRM byte with
              getIByte; getUChar is not defined in this part of the
              file -- confirm the two are equivalent. */
   1812    UChar   rm   = getUChar(delta0);
   1813    IRTemp  addr = IRTemp_INVALID;
   1814 
   1815    /* addSubCarry == True indicates the intended operation is
   1816       add-with-carry or subtract-with-borrow. */
   1817    if (addSubCarry) {
   1818       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1819       vassert(keep);
   1820    }
   1821 
   1822    if (epartIsReg(rm)) {
   1823       /* Specially handle XOR reg,reg, because that doesn't really
   1824          depend on reg, and doing the obvious thing potentially
   1825          generates a spurious value check failure due to the bogus
   1826          dependency.  Ditto SBB reg,reg. */
   1827       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1828           && gregOfRM(rm) == eregOfRM(rm)) {
               /* The register is zeroed before it is read below, so
                  dst0 and src are both taken from the zeroed register. */
   1829          putIReg(size, gregOfRM(rm), mkU(ty,0));
   1830       }
   1831       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1832       assign( src,  getIReg(size,eregOfRM(rm)) );
   1833 
            /* ADC/SBB go through their helpers; keep was asserted True
               above, hence the unconditional write-back to G. */
   1834       if (addSubCarry && op8 == Iop_Add8) {
   1835          helper_ADC( size, dst1, dst0, src,
   1836                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1837          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1838       } else
   1839       if (addSubCarry && op8 == Iop_Sub8) {
   1840          helper_SBB( size, dst1, dst0, src,
   1841                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1842          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1843       } else {
   1844          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
               /* Add/sub hand both operands to the flag setter; all other
                  ops hand over only the result. */
   1845          if (isAddSub(op8))
   1846             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1847          else
   1848             setFlags_DEP1(op8, dst1, ty);
   1849          if (keep)
   1850             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1851       }
   1852 
   1853       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1854                           nameIReg(size,eregOfRM(rm)),
   1855                           nameIReg(size,gregOfRM(rm)));
   1856       return 1+delta0;
   1857    } else {
   1858       /* E refers to memory */
            /* Same three-way split as the register case, except that src
               is loaded from the address computed by disAMode. */
   1859       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1860       assign( dst0, getIReg(size,gregOfRM(rm)) );
   1861       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   1862 
   1863       if (addSubCarry && op8 == Iop_Add8) {
   1864          helper_ADC( size, dst1, dst0, src,
   1865                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1866          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1867       } else
   1868       if (addSubCarry && op8 == Iop_Sub8) {
   1869          helper_SBB( size, dst1, dst0, src,
   1870                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1871          putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1872       } else {
   1873          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   1874          if (isAddSub(op8))
   1875             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1876          else
   1877             setFlags_DEP1(op8, dst1, ty);
   1878          if (keep)
   1879             putIReg(size, gregOfRM(rm), mkexpr(dst1));
   1880       }
   1881 
   1882       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1883                           dis_buf,nameIReg(size,gregOfRM(rm)));
   1884       return len+delta0;
   1885    }
   1886 }
   1887 
   1888 
   1889 
   1890 /* Handle binary integer instructions of the form
   1891       op G, E  meaning
   1892       op reg, reg-or-mem
   1893    Is passed a ptr to the modRM byte, the actual operation, and the
   1894    data size.  Returns the address advanced completely over this
   1895    instruction.
   1896 
   1897    G(src) is reg.
   1898    E(dst) is reg-or-mem
   1899 
   1900    If E is reg, -->    GET %E,  tmp
   1901                        OP %G,   tmp
   1902                        PUT tmp, %E
   1903 
   1904    If E is mem, -->    (getAddr E) -> tmpa
   1905                        LD (tmpa), tmpv
   1906                        OP %G, tmpv
   1907                        ST tmpv, (tmpa)

           Parameters:
              sorb        passed through to disAMode when E is memory.
              locked      only consulted when E is memory: if True the
                          write-back is done compare-and-swap style
                          (casLE, or the helpers' "cas-style store"),
                          with the value just loaded as the expected
                          value; if False a plain store is used.
              addSubCarry True selects add-with-carry (op8 == Iop_Add8)
                          or subtract-with-borrow (op8 == Iop_Sub8);
                          keep must then be True.
              op8         the 8-bit form of the operation; mkSizedOp
                          turns it into the form for the operand type.
              keep        if False the result is not written back to E.
              size        operand size, as accepted by szToITy/getIReg.
              delta0      offset of the modRM byte.
              t_x86opc    mnemonic text, used only in the DIP trace.

           Returns delta0+1 when E is a register, else delta0 plus the
           amode length reported by disAMode.
   1908 */
   1909 static
   1910 UInt dis_op2_G_E ( UChar       sorb,
   1911                    Bool        locked,
   1912                    Bool        addSubCarry,
   1913                    IROp        op8,
   1914                    Bool        keep,
   1915                    Int         size,
   1916                    Int         delta0,
   1917                    HChar*      t_x86opc )
   1918 {
   1919    HChar   dis_buf[50];
   1920    Int     len;
   1921    IRType  ty   = szToITy(size);
   1922    IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   1923    IRTemp  src  = newTemp(ty);   /* value of G */
   1924    IRTemp  dst0 = newTemp(ty);   /* value of E before the operation */
   1925    UChar   rm   = getIByte(delta0);
   1926    IRTemp  addr = IRTemp_INVALID;
   1927 
   1928    /* addSubCarry == True indicates the intended operation is
   1929       add-with-carry or subtract-with-borrow. */
   1930    if (addSubCarry) {
   1931       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1932       vassert(keep);
   1933    }
   1934 
   1935    if (epartIsReg(rm)) {
   1936       /* Specially handle XOR reg,reg, because that doesn't really
   1937          depend on reg, and doing the obvious thing potentially
   1938          generates a spurious value check failure due to the bogus
   1939          dependency.  Ditto SBB reg,reg.*/
   1940       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   1941           && gregOfRM(rm) == eregOfRM(rm)) {
               /* The register is zeroed before it is read below, so
                  dst0 and src are both taken from the zeroed register. */
   1942          putIReg(size, eregOfRM(rm), mkU(ty,0));
   1943       }
   1944       assign(dst0, getIReg(size,eregOfRM(rm)));
   1945       assign(src,  getIReg(size,gregOfRM(rm)));
   1946 
            /* ADC/SBB go through their helpers; keep was asserted True
               above, hence the unconditional write-back to E. */
   1947       if (addSubCarry && op8 == Iop_Add8) {
   1948          helper_ADC( size, dst1, dst0, src,
   1949                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1950          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1951       } else
   1952       if (addSubCarry && op8 == Iop_Sub8) {
   1953          helper_SBB( size, dst1, dst0, src,
   1954                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   1955          putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1956       } else {
   1957          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
               /* Add/sub hand both operands to the flag setter; all other
                  ops hand over only the result. */
   1958          if (isAddSub(op8))
   1959             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   1960          else
   1961             setFlags_DEP1(op8, dst1, ty);
   1962          if (keep)
   1963             putIReg(size, eregOfRM(rm), mkexpr(dst1));
   1964       }
   1965 
   1966       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   1967                           nameIReg(size,gregOfRM(rm)),
   1968                           nameIReg(size,eregOfRM(rm)));
   1969       return 1+delta0;
   1970    }
   1971 
   1972    /* E refers to memory */
   1973    {
   1974       addr = disAMode ( &len, sorb, delta0, dis_buf);
   1975       assign(dst0, loadLE(ty,mkexpr(addr)));
   1976       assign(src,  getIReg(size,gregOfRM(rm)));
   1977 
            /* For ADC/SBB the helper is given the address to write to;
               in the locked case it is also given dst0 as the expected
               old value plus the current instruction's EIP.
               NOTE(review): the helpers are defined elsewhere; presumably
               they emit the store (plain or CAS) themselves -- confirm. */
   1978       if (addSubCarry && op8 == Iop_Add8) {
   1979          if (locked) {
   1980             /* cas-style store */
   1981             helper_ADC( size, dst1, dst0, src,
   1982                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   1983          } else {
   1984             /* normal store */
   1985             helper_ADC( size, dst1, dst0, src,
   1986                         /*store*/addr, IRTemp_INVALID, 0 );
   1987          }
   1988       } else
   1989       if (addSubCarry && op8 == Iop_Sub8) {
   1990          if (locked) {
   1991             /* cas-style store */
   1992             helper_SBB( size, dst1, dst0, src,
   1993                         /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   1994          } else {
   1995             /* normal store */
   1996             helper_SBB( size, dst1, dst0, src,
   1997                         /*store*/addr, IRTemp_INVALID, 0 );
   1998          }
   1999       } else {
   2000          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
               /* The write-back (if any) is emitted before the flag
                  setting.  With keep == False nothing is written to
                  memory, whatever the value of locked. */
   2001          if (keep) {
   2002             if (locked) {
   2003                if (0) vex_printf("locked case\n" );
   2004                casLE( mkexpr(addr),
   2005                       mkexpr(dst0)/*expval*/,
   2006                       mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
   2007             } else {
   2008                if (0) vex_printf("nonlocked case\n");
   2009                storeLE(mkexpr(addr), mkexpr(dst1));
   2010             }
   2011          }
   2012          if (isAddSub(op8))
   2013             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2014          else
   2015             setFlags_DEP1(op8, dst1, ty);
   2016       }
   2017 
   2018       DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
   2019                           nameIReg(size,gregOfRM(rm)), dis_buf);
   2020       return len+delta0;
   2021    }
   2022 }
   2023 
   2024 
   2025 /* Handle move instructions of the form
   2026       mov E, G  meaning
   2027       mov reg-or-mem, reg
   2028    Is passed a ptr to the modRM byte, and the data size.  Returns
   2029    the address advanced completely over this instruction.
   2030 
   2031    E(src) is reg-or-mem
   2032    G(dst) is reg.
   2033 
   2034    If E is reg, -->    GET %E,  tmpv
   2035                        PUT tmpv, %G
   2036 
   2037    If E is mem  -->    (getAddr E) -> tmpa
   2038                        LD (tmpa), tmpb
   2039                        PUT tmpb, %G

           Parameters:
              sorb    passed through to disAMode when E is memory.
              size    operand size, as accepted by getIReg/putIReg and
                      szToITy.
              delta0  offset of the modRM byte.

           Returns delta0+1 when E is a register, else delta0 plus the
           amode length reported by disAMode.  No flag-setting calls
           are made here.
   2040 */
   2041 static
   2042 UInt dis_mov_E_G ( UChar       sorb,
   2043                    Int         size,
   2044                    Int         delta0 )
   2045 {
   2046    Int len;
   2047    UChar rm = getIByte(delta0);
   2048    HChar dis_buf[50];
   2049 
   2050    if (epartIsReg(rm)) {
            /* Register-to-register copy: G := E. */
   2051       putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
   2052       DIP("mov%c %s,%s\n", nameISize(size),
   2053                            nameIReg(size,eregOfRM(rm)),
   2054                            nameIReg(size,gregOfRM(rm)));
   2055       return 1+delta0;
   2056    }
   2057 
   2058    /* E refers to memory */
   2059    {
            /* G := little-endian load of 'size' from the decoded address. */
   2060       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   2061       putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
   2062       DIP("mov%c %s,%s\n", nameISize(size),
   2063                            dis_buf,nameIReg(size,gregOfRM(rm)));
   2064       return delta0+len;
   2065    }
   2066 }
   2067 
   2068 
   2069 /* Handle move instructions of the form
   2070       mov G, E  meaning
   2071       mov reg, reg-or-mem
   2072    Is passed a ptr to the modRM byte, and the data size.  Returns
   2073    the address advanced completely over this instruction.
   2074 
   2075    G(src) is reg.
   2076    E(dst) is reg-or-mem
   2077 
   2078    If E is reg, -->    GET %G,  tmp
   2079                        PUT tmp, %E
   2080 
   2081    If E is mem, -->    (getAddr E) -> tmpa
   2082                        GET %G, tmpv
   2083                        ST tmpv, (tmpa)

           Parameters:
              sorb    passed through to disAMode when E is memory.
              size    operand size, as accepted by getIReg/putIReg.
              delta0  offset of the modRM byte.

           Returns delta0+1 when E is a register, else delta0 plus the
           amode length reported by disAMode.  The memory form always
           uses a plain storeLE; no flag-setting calls are made here.
   2084 */
   2085 static
   2086 UInt dis_mov_G_E ( UChar       sorb,
   2087                    Int         size,
   2088                    Int         delta0 )
   2089 {
   2090    Int len;
   2091    UChar rm = getIByte(delta0);
   2092    HChar dis_buf[50];
   2093 
   2094    if (epartIsReg(rm)) {
            /* Register-to-register copy: E := G. */
   2095       putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
   2096       DIP("mov%c %s,%s\n", nameISize(size),
   2097                            nameIReg(size,gregOfRM(rm)),
   2098                            nameIReg(size,eregOfRM(rm)));
   2099       return 1+delta0;
   2100    }
   2101 
   2102    /* E refers to memory */
   2103    {
            /* Little-endian store of G to the decoded address. */
   2104       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
   2105       storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
   2106       DIP("mov%c %s,%s\n", nameISize(size),
   2107                            nameIReg(size,gregOfRM(rm)), dis_buf);
   2108       return len+delta0;
   2109    }
   2110 }
   2111 
   2112 
   2113 /* op $immediate, AL/AX/EAX.

           The destination is always register R_EAX at the given size;
           the source is a literal read from the instruction stream at
           delta with getUDisp(size, delta).

           Parameters:
              size      operand size; also the number of immediate bytes
                        skipped in the return value.
              carrying  True selects add-with-carry / subtract-with-borrow
                        (op8 must then be Iop_Add8 or Iop_Sub8, otherwise
                        the function panics or asserts).
              op8       the 8-bit form of the operation.
              keep      if False the result is not written back to the
                        register.
              delta     offset of the immediate.
              t_x86opc  mnemonic text, used only in the DIP trace.

           Returns delta+size.  */
   2114 static
   2115 UInt dis_op_imm_A ( Int    size,
   2116                     Bool   carrying,
   2117                     IROp   op8,
   2118                     Bool   keep,
   2119                     Int    delta,
   2120                     HChar* t_x86opc )
   2121 {
   2122    IRType ty   = szToITy(size);
   2123    IRTemp dst0 = newTemp(ty);   /* register value before the operation */
   2124    IRTemp src  = newTemp(ty);   /* the immediate */
   2125    IRTemp dst1 = newTemp(ty);   /* result of the operation */
   2126    UInt lit    = getUDisp(size,delta);
   2127    assign(dst0, getIReg(size,R_EAX));
   2128    assign(src,  mkU(ty,lit));
   2129 
           /* Four supported combinations, tried in this order: plain
              add/sub, logic op, ADC, SBB.  Anything else is a panic. */
   2130    if (isAddSub(op8) && !carrying) {
   2131       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2132       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2133    }
   2134    else
   2135    if (isLogic(op8)) {
   2136       vassert(!carrying);
   2137       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2138       setFlags_DEP1(op8, dst1, ty);
   2139    }
   2140    else
   2141    if (op8 == Iop_Add8 && carrying) {
   2142       helper_ADC( size, dst1, dst0, src,
   2143                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2144    }
   2145    else
   2146    if (op8 == Iop_Sub8 && carrying) {
   2147       helper_SBB( size, dst1, dst0, src,
   2148                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2149    }
   2150    else
   2151       vpanic("dis_op_imm_A(x86,guest)");
   2152 
   2153    if (keep)
   2154       putIReg(size, R_EAX, mkexpr(dst1));
   2155 
   2156    DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
   2157                            lit, nameIReg(size,R_EAX));
   2158    return delta+size;
   2159 }
   2160 
   2161 
   2162 /* Sign- and Zero-extending moves.

           Moves E (register or memory, szs wide) into G (register, szd
           wide), widening with the op chosen by
           mkWidenOp(szs, szd, sign_extend).  When szd == szs no widening
           op is applied and the value is copied unchanged.

           Parameters:
              sorb         passed through to disAMode when E is memory.
              delta        offset of the modRM byte.
              szs          source operand size.
              szd          destination operand size.
              sign_extend  True for sign extension, False for zero
                           extension (also selects 's' or 'z' in the
                           DIP trace).

           Returns delta+1 when E is a register, else delta plus the
           amode length reported by disAMode.  */
   2163 static
   2164 UInt dis_movx_E_G ( UChar      sorb,
   2165                     Int delta, Int szs, Int szd, Bool sign_extend )
   2166 {
   2167    UChar rm = getIByte(delta);
   2168    if (epartIsReg(rm)) {
   2169       if (szd == szs) {
   2170          // mutant case.  See #250799
   2171          putIReg(szd, gregOfRM(rm),
   2172                            getIReg(szs,eregOfRM(rm)));
   2173       } else {
   2174          // normal case
   2175          putIReg(szd, gregOfRM(rm),
   2176                       unop(mkWidenOp(szs,szd,sign_extend),
   2177                            getIReg(szs,eregOfRM(rm))));
   2178       }
   2179       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2180                                nameISize(szs), nameISize(szd),
   2181                                nameIReg(szs,eregOfRM(rm)),
   2182                                nameIReg(szd,gregOfRM(rm)));
   2183       return 1+delta;
   2184    }
   2185 
   2186    /* E refers to memory */
   2187    {
   2188       Int    len;
   2189       HChar  dis_buf[50];
   2190       IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
            /* Same two cases as above, with the source loaded from
               memory at the source width. */
   2191       if (szd == szs) {
   2192          // mutant case.  See #250799
   2193          putIReg(szd, gregOfRM(rm),
   2194                            loadLE(szToITy(szs),mkexpr(addr)));
   2195       } else {
   2196          // normal case
   2197          putIReg(szd, gregOfRM(rm),
   2198                       unop(mkWidenOp(szs,szd,sign_extend),
   2199                            loadLE(szToITy(szs),mkexpr(addr))));
   2200       }
   2201       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   2202                                nameISize(szs), nameISize(szd),
   2203                                dis_buf, nameIReg(szd,gregOfRM(rm)));
   2204       return len+delta;
   2205    }
   2206 }
   2207 
   2208 
   2209 /* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
   2210    16 / 8 bit quantity in the given IRTemp.

           Parameters:
              sz             divisor size: 4, 2 or 1; anything else
                             panics.
              t              temp holding the divisor, of that width.
              signed_divide  selects Iop_DivModS64to32 and sign-extending
                             widenings instead of Iop_DivModU64to32 and
                             zero-extending ones.

           All three sizes use the same 64-by-32 IR op: narrower
           operands are widened first and the two halves of the 64-bit
           result are narrowed again before being written back.  The low
           half goes to EAX/AX/AL and the high half to EDX/DX/AH.
           NOTE(review): presumably low half = quotient and high half =
           remainder -- confirm against the Iop_DivMod*64to32 definition.
           This function itself emits no divide-by-zero check.  */
   2211 static
   2212 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   2213 {
   2214    IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
   2215    IRTemp src64 = newTemp(Ity_I64);   /* widened dividend */
   2216    IRTemp dst64 = newTemp(Ity_I64);   /* two-part result of op */
   2217    switch (sz) {
   2218       case 4:
               /* Dividend is EDX (high) : EAX (low); divisor used as is. */
   2219          assign( src64, binop(Iop_32HLto64,
   2220                               getIReg(4,R_EDX), getIReg(4,R_EAX)) );
   2221          assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
   2222          putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
   2223          putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
   2224          break;
   2225       case 2: {
               /* Dividend is DX (high) : AX (low), widened 32 -> 64;
                  divisor widened 16 -> 32. */
   2226          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2227          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2228          assign( src64, unop(widen3264,
   2229                              binop(Iop_16HLto32,
   2230                                    getIReg(2,R_EDX), getIReg(2,R_EAX))) );
   2231          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   2232          putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   2233          putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   2234          break;
   2235       }
   2236       case 1: {
               /* Dividend is AX, widened 16 -> 32 -> 64; divisor widened
                  8 -> 16 -> 32.  Results are narrowed to 8 bits and
                  written to AL and AH. */
   2237          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   2238          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   2239          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   2240          assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
   2241          assign( dst64,
   2242                  binop(op, mkexpr(src64),
   2243                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   2244          putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
   2245                            unop(Iop_64to32,mkexpr(dst64)))) );
   2246          putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
   2247                            unop(Iop_64HIto32,mkexpr(dst64)))) );
   2248          break;
   2249       }
   2250       default: vpanic("codegen_div(x86)");
   2251    }
   2252 }
   2253 
   2254 
        /* Group 1 extended opcodes: op $immediate, E, where the operation
           is selected by the reg field of the modRM byte:
              0 add   1 or   2 adc   3 sbb   4 and   5 sub   6 xor
              7 subtract for flags only (result is not written back;
                presumably CMP -- the name comes from nameGrp1).

           Parameters:
              sorb    passed through to disAMode when E is memory.
              locked  only consulted when E is memory: if True the
                      write-back is done compare-and-swap style, with the
                      value just loaded as the expected value.
              delta   offset of the modRM byte.
              modrm   the modRM byte itself.
              am_sz   amode length used to advance delta in the register
                      case, where it is asserted to be 1.
              d_sz    number of bytes added to delta for the immediate.
              sz      operand size: 1, 2, or otherwise treated as 4 when
                      masking the immediate.
              d32     the immediate; masked down to sz bytes before use.

           Returns delta advanced over the amode and the immediate.  */
   2255 static
   2256 UInt dis_Grp1 ( UChar sorb, Bool locked,
   2257                 Int delta, UChar modrm,
   2258                 Int am_sz, Int d_sz, Int sz, UInt d32 )
   2259 {
   2260    Int     len;
   2261    HChar   dis_buf[50];
   2262    IRType  ty   = szToITy(sz);
   2263    IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   2264    IRTemp  src  = newTemp(ty);   /* the (masked) immediate */
   2265    IRTemp  dst0 = newTemp(ty);   /* value of E before the operation */
   2266    IRTemp  addr = IRTemp_INVALID;
   2267    IROp    op8  = Iop_INVALID;
   2268    UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
   2269 
           /* op8 stays Iop_INVALID for ADC and SBB; those two are handled
              by their helpers below and never reach mkSizedOp. */
   2270    switch (gregOfRM(modrm)) {
   2271       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   2272       case 2: break;  // ADC
   2273       case 3: break;  // SBB
   2274       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   2275       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   2276       /*NOTREACHED*/
   2277       default: vpanic("dis_Grp1: unhandled case");
   2278    }
   2279 
   2280    if (epartIsReg(modrm)) {
   2281       vassert(am_sz == 1);
   2282 
   2283       assign(dst0, getIReg(sz,eregOfRM(modrm)));
   2284       assign(src,  mkU(ty,d32 & mask));
   2285 
   2286       if (gregOfRM(modrm) == 2 /* ADC */) {
   2287          helper_ADC( sz, dst1, dst0, src,
   2288                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2289       } else
   2290       if (gregOfRM(modrm) == 3 /* SBB */) {
   2291          helper_SBB( sz, dst1, dst0, src,
   2292                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2293       } else {
   2294          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
               /* Add/sub hand both operands to the flag setter; all other
                  ops hand over only the result. */
   2295          if (isAddSub(op8))
   2296             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2297          else
   2298             setFlags_DEP1(op8, dst1, ty);
   2299       }
   2300 
            /* Sub-opcode 7 only sets flags; its result is discarded. */
   2301       if (gregOfRM(modrm) < 7)
   2302          putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2303 
   2304       delta += (am_sz + d_sz);
            /* The trace prints d32 as passed in, not the masked value. */
   2305       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
   2306                               nameIReg(sz,eregOfRM(modrm)));
   2307    } else {
   2308       addr = disAMode ( &len, sorb, delta, dis_buf);
   2309 
   2310       assign(dst0, loadLE(ty,mkexpr(addr)));
   2311       assign(src, mkU(ty,d32 & mask));
   2312 
            /* For ADC/SBB the helper is given the address to write to;
               in the locked case it is also given dst0 as the expected
               old value plus the current instruction's EIP.
               NOTE(review): the helpers are defined elsewhere; presumably
               they emit the store (plain or CAS) themselves -- confirm. */
   2313       if (gregOfRM(modrm) == 2 /* ADC */) {
   2314          if (locked) {
   2315             /* cas-style store */
   2316             helper_ADC( sz, dst1, dst0, src,
   2317                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2318          } else {
   2319             /* normal store */
   2320             helper_ADC( sz, dst1, dst0, src,
   2321                         /*store*/addr, IRTemp_INVALID, 0 );
   2322          }
   2323       } else
   2324       if (gregOfRM(modrm) == 3 /* SBB */) {
   2325          if (locked) {
   2326             /* cas-style store */
   2327             helper_SBB( sz, dst1, dst0, src,
   2328                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
   2329          } else {
   2330             /* normal store */
   2331             helper_SBB( sz, dst1, dst0, src,
   2332                         /*store*/addr, IRTemp_INVALID, 0 );
   2333          }
   2334       } else {
   2335          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
               /* Sub-opcode 7 writes nothing back to memory.  For the
                  others the write-back is emitted before the flag
                  setting. */
   2336          if (gregOfRM(modrm) < 7) {
   2337             if (locked) {
   2338                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   2339                                     mkexpr(dst1)/*newVal*/,
   2340                                     guest_EIP_curr_instr );
   2341             } else {
   2342                storeLE(mkexpr(addr), mkexpr(dst1));
   2343             }
   2344          }
   2345          if (isAddSub(op8))
   2346             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2347          else
   2348             setFlags_DEP1(op8, dst1, ty);
   2349       }
   2350 
            /* Here the amode length comes from disAMode (len), not am_sz. */
   2351       delta += (len+d_sz);
   2352       DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
   2353                               d32, dis_buf);
   2354    }
   2355    return delta;
   2356 }
   2357 
   2358 
   2359 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   2360    expression. */
   2361 
   2362 static
   2363 UInt dis_Grp2 ( UChar sorb,
   2364                 Int delta, UChar modrm,
   2365                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   2366                 HChar* shift_expr_txt, Bool* decode_OK )
   2367 {
   2368    /* delta on entry points at the modrm byte. */
   2369    HChar  dis_buf[50];
   2370    Int    len;
   2371    Bool   isShift, isRotate, isRotateC;
   2372    IRType ty    = szToITy(sz);
   2373    IRTemp dst0  = newTemp(ty);
   2374    IRTemp dst1  = newTemp(ty);
   2375    IRTemp addr  = IRTemp_INVALID;
   2376 
   2377    *decode_OK = True;
   2378 
   2379    vassert(sz == 1 || sz == 2 || sz == 4);
   2380 
   2381    /* Put value to shift/rotate in dst0. */
   2382    if (epartIsReg(modrm)) {
   2383       assign(dst0, getIReg(sz, eregOfRM(modrm)));
   2384       delta += (am_sz + d_sz);
   2385    } else {
   2386       addr = disAMode ( &len, sorb, delta, dis_buf);
   2387       assign(dst0, loadLE(ty,mkexpr(addr)));
   2388       delta += len + d_sz;
   2389    }
   2390 
   2391    isShift = False;
   2392    switch (gregOfRM(modrm)) { case 4: case 5: case 7: isShift = True; }
   2393 
   2394    isRotate = False;
   2395    switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
   2396 
   2397    isRotateC = False;
   2398    switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
   2399 
   2400    if (gregOfRM(modrm) == 6) {
   2401       *decode_OK = False;
   2402       return delta;
   2403    }
   2404 
   2405    if (!isShift && !isRotate && !isRotateC) {
   2406       /*NOTREACHED*/
   2407       vpanic("dis_Grp2(Reg): unhandled case(x86)");
   2408    }
   2409 
   2410    if (isRotateC) {
   2411       /* call a helper; these insns are so ridiculous they do not
   2412          deserve better */
   2413       Bool     left = toBool(gregOfRM(modrm) == 2);
   2414       IRTemp   r64  = newTemp(Ity_I64);
   2415       IRExpr** args
   2416          = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
   2417                           widenUto32(shift_expr),   /* rotate amount */
   2418                           widenUto32(mk_x86g_calculate_eflags_all()),
   2419                           mkU32(sz) );
   2420       assign( r64, mkIRExprCCall(
   2421                       Ity_I64,
   2422                       0/*regparm*/,
   2423                       left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
   2424                       left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
   2425                       args
   2426                    )
   2427             );
   2428       /* new eflags in hi half r64; new value in lo half r64 */
   2429       assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
   2430       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2431       stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
   2432       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2433       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2434          elimination of previous stores to this field work better. */
   2435       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2436    }
   2437 
   2438    if (isShift) {
   2439 
   2440       IRTemp pre32     = newTemp(Ity_I32);
   2441       IRTemp res32     = newTemp(Ity_I32);
   2442       IRTemp res32ss   = newTemp(Ity_I32);
   2443       IRTemp shift_amt = newTemp(Ity_I8);
   2444       IROp   op32;
   2445 
   2446       switch (gregOfRM(modrm)) {
   2447          case 4: op32 = Iop_Shl32; break;
   2448          case 5: op32 = Iop_Shr32; break;
   2449          case 7: op32 = Iop_Sar32; break;
   2450          /*NOTREACHED*/
   2451          default: vpanic("dis_Grp2:shift"); break;
   2452       }
   2453 
   2454       /* Widen the value to be shifted to 32 bits, do the shift, and
   2455          narrow back down.  This seems surprisingly long-winded, but
   2456          unfortunately the Intel semantics requires that 8/16-bit
   2457          shifts give defined results for shift values all the way up
   2458          to 31, and this seems the simplest way to do it.  It has the
   2459          advantage that the only IR level shifts generated are of 32
   2460          bit values, and the shift amount is guaranteed to be in the
   2461          range 0 .. 31, thereby observing the IR semantics requiring
   2462          all shift values to be in the range 0 .. 2^word_size-1. */
   2463 
   2464       /* shift_amt = shift_expr & 31, regardless of operation size */
   2465       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
   2466 
   2467       /* suitably widen the value to be shifted to 32 bits. */
   2468       assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
   2469                                      : widenUto32(mkexpr(dst0)) );
   2470 
   2471       /* res32 = pre32 `shift` shift_amt */
   2472       assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
   2473 
   2474       /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
   2475       assign( res32ss,
   2476               binop(op32,
   2477                     mkexpr(pre32),
   2478                     binop(Iop_And8,
   2479                           binop(Iop_Sub8,
   2480                                 mkexpr(shift_amt), mkU8(1)),
   2481                           mkU8(31))) );
   2482 
   2483       /* Build the flags thunk. */
   2484       setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
   2485 
   2486       /* Narrow the result back down. */
   2487       assign( dst1, narrowTo(ty, mkexpr(res32)) );
   2488 
   2489    } /* if (isShift) */
   2490 
   2491    else
   2492    if (isRotate) {
   2493       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
   2494       Bool   left      = toBool(gregOfRM(modrm) == 0);
   2495       IRTemp rot_amt   = newTemp(Ity_I8);
   2496       IRTemp rot_amt32 = newTemp(Ity_I8);
   2497       IRTemp oldFlags  = newTemp(Ity_I32);
   2498 
   2499       /* rot_amt = shift_expr & mask */
   2500       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   2501          expressions never shift beyond the word size and thus remain
   2502          well defined. */
   2503       assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
   2504 
   2505       if (ty == Ity_I32)
   2506          assign(rot_amt, mkexpr(rot_amt32));
   2507       else
   2508          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
   2509 
   2510       if (left) {
   2511 
   2512          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   2513          assign(dst1,
   2514             binop( mkSizedOp(ty,Iop_Or8),
   2515                    binop( mkSizedOp(ty,Iop_Shl8),
   2516                           mkexpr(dst0),
   2517                           mkexpr(rot_amt)
   2518                    ),
   2519                    binop( mkSizedOp(ty,Iop_Shr8),
   2520                           mkexpr(dst0),
   2521                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2522                    )
   2523             )
   2524          );
   2525          ccOp += X86G_CC_OP_ROLB;
   2526 
   2527       } else { /* right */
   2528 
   2529          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   2530          assign(dst1,
   2531             binop( mkSizedOp(ty,Iop_Or8),
   2532                    binop( mkSizedOp(ty,Iop_Shr8),
   2533                           mkexpr(dst0),
   2534                           mkexpr(rot_amt)
   2535                    ),
   2536                    binop( mkSizedOp(ty,Iop_Shl8),
   2537                           mkexpr(dst0),
   2538                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   2539                    )
   2540             )
   2541          );
   2542          ccOp += X86G_CC_OP_RORB;
   2543 
   2544       }
   2545 
   2546       /* dst1 now holds the rotated value.  Build flag thunk.  We
   2547          need the resulting value for this, and the previous flags.
   2548          Except don't set it if the rotate count is zero. */
   2549 
   2550       assign(oldFlags, mk_x86g_calculate_eflags_all());
   2551 
   2552       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   2553       stmt( IRStmt_Put( OFFB_CC_OP,
   2554                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2555                                       IRExpr_Get(OFFB_CC_OP,Ity_I32),
   2556                                       mkU32(ccOp))) );
   2557       stmt( IRStmt_Put( OFFB_CC_DEP1,
   2558                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2559                                       IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
   2560                                       widenUto32(mkexpr(dst1)))) );
   2561       stmt( IRStmt_Put( OFFB_CC_DEP2,
   2562                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2563                                       IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
   2564                                       mkU32(0))) );
   2565       stmt( IRStmt_Put( OFFB_CC_NDEP,
   2566                         IRExpr_Mux0X( mkexpr(rot_amt32),
   2567                                       IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
   2568                                       mkexpr(oldFlags))) );
   2569    } /* if (isRotate) */
   2570 
   2571    /* Save result, and finish up. */
   2572    if (epartIsReg(modrm)) {
   2573       putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2574       if (vex_traceflags & VEX_TRACE_FE) {
   2575          vex_printf("%s%c ",
   2576                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2577          if (shift_expr_txt)
   2578             vex_printf("%s", shift_expr_txt);
   2579          else
   2580             ppIRExpr(shift_expr);
   2581          vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
   2582       }
   2583    } else {
   2584       storeLE(mkexpr(addr), mkexpr(dst1));
   2585       if (vex_traceflags & VEX_TRACE_FE) {
   2586          vex_printf("%s%c ",
   2587                     nameGrp2(gregOfRM(modrm)), nameISize(sz) );
   2588          if (shift_expr_txt)
   2589             vex_printf("%s", shift_expr_txt);
   2590          else
   2591             ppIRExpr(shift_expr);
   2592          vex_printf(", %s\n", dis_buf);
   2593       }
   2594    }
   2595    return delta;
   2596 }
   2597 
   2598 
   2599 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   2600 static
   2601 UInt dis_Grp8_Imm ( UChar sorb,
   2602                     Bool locked,
   2603                     Int delta, UChar modrm,
   2604                     Int am_sz, Int sz, UInt src_val,
   2605                     Bool* decode_OK )
   2606 {
   2607    /* src_val denotes a d8.
   2608       And delta on entry points at the modrm byte. */
   2609 
   2610    IRType ty     = szToITy(sz);
   2611    IRTemp t2     = newTemp(Ity_I32);
   2612    IRTemp t2m    = newTemp(Ity_I32);
   2613    IRTemp t_addr = IRTemp_INVALID;
   2614    HChar  dis_buf[50];
   2615    UInt   mask;
   2616 
   2617    /* we're optimists :-) */
   2618    *decode_OK = True;
   2619 
   2620    /* Limit src_val -- the bit offset -- to something within a word.
   2621       The Intel docs say that literal offsets larger than a word are
   2622       masked in this way. */
   2623    switch (sz) {
   2624       case 2:  src_val &= 15; break;
   2625       case 4:  src_val &= 31; break;
   2626       default: *decode_OK = False; return delta;
   2627    }
   2628 
   2629    /* Invent a mask suitable for the operation. */
   2630    switch (gregOfRM(modrm)) {
   2631       case 4: /* BT */  mask = 0;               break;
   2632       case 5: /* BTS */ mask = 1 << src_val;    break;
   2633       case 6: /* BTR */ mask = ~(1 << src_val); break;
   2634       case 7: /* BTC */ mask = 1 << src_val;    break;
   2635          /* If this needs to be extended, probably simplest to make a
   2636             new function to handle the other cases (0 .. 3).  The
   2637             Intel docs do however not indicate any use for 0 .. 3, so
   2638             we don't expect this to happen. */
   2639       default: *decode_OK = False; return delta;
   2640    }
   2641 
   2642    /* Fetch the value to be tested and modified into t2, which is
   2643       32-bits wide regardless of sz. */
   2644    if (epartIsReg(modrm)) {
   2645       vassert(am_sz == 1);
   2646       assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
   2647       delta += (am_sz + 1);
   2648       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2649                               src_val, nameIReg(sz,eregOfRM(modrm)));
   2650    } else {
   2651       Int len;
   2652       t_addr = disAMode ( &len, sorb, delta, dis_buf);
   2653       delta  += (len+1);
   2654       assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
   2655       DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
   2656                               src_val, dis_buf);
   2657    }
   2658 
   2659    /* Compute the new value into t2m, if non-BT. */
   2660    switch (gregOfRM(modrm)) {
   2661       case 4: /* BT */
   2662          break;
   2663       case 5: /* BTS */
   2664          assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
   2665          break;
   2666       case 6: /* BTR */
   2667          assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
   2668          break;
   2669       case 7: /* BTC */
   2670          assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
   2671          break;
   2672       default:
   2673          /*NOTREACHED*/ /*the previous switch guards this*/
   2674          vassert(0);
   2675    }
   2676 
   2677    /* Write the result back, if non-BT.  If the CAS fails then we
   2678       side-exit from the trace at this point, and so the flag state is
   2679       not affected.  This is of course as required. */
   2680    if (gregOfRM(modrm) != 4 /* BT */) {
   2681       if (epartIsReg(modrm)) {
   2682          putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
   2683       } else {
   2684          if (locked) {
   2685             casLE( mkexpr(t_addr),
   2686                    narrowTo(ty, mkexpr(t2))/*expd*/,
   2687                    narrowTo(ty, mkexpr(t2m))/*new*/,
   2688                    guest_EIP_curr_instr );
   2689          } else {
   2690             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   2691          }
   2692       }
   2693    }
   2694 
   2695    /* Copy relevant bit from t2 into the carry flag. */
   2696    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   2697    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   2698    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   2699    stmt( IRStmt_Put(
   2700             OFFB_CC_DEP1,
   2701             binop(Iop_And32,
   2702                   binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
   2703                   mkU32(1))
   2704        ));
   2705    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   2706       elimination of previous stores to this field work better. */
   2707    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   2708 
   2709    return delta;
   2710 }
   2711 
   2712 
   2713 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   2714    value in EAX/AX/AL by the given IRTemp, and park the result in
   2715    EDX:EAX/DX:AX/AX.
   2716 */
   2717 static void codegen_mulL_A_D ( Int sz, Bool syned,
   2718                                IRTemp tmp, HChar* tmp_txt )
   2719 {
   2720    IRType ty = szToITy(sz);
   2721    IRTemp t1 = newTemp(ty);
   2722 
   2723    assign( t1, getIReg(sz, R_EAX) );
   2724 
   2725    switch (ty) {
   2726       case Ity_I32: {
   2727          IRTemp res64   = newTemp(Ity_I64);
   2728          IRTemp resHi   = newTemp(Ity_I32);
   2729          IRTemp resLo   = newTemp(Ity_I32);
   2730          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   2731          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2732          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   2733          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2734          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   2735          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   2736          putIReg(4, R_EDX, mkexpr(resHi));
   2737          putIReg(4, R_EAX, mkexpr(resLo));
   2738          break;
   2739       }
   2740       case Ity_I16: {
   2741          IRTemp res32   = newTemp(Ity_I32);
   2742          IRTemp resHi   = newTemp(Ity_I16);
   2743          IRTemp resLo   = newTemp(Ity_I16);
   2744          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   2745          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2746          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   2747          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2748          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   2749          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   2750          putIReg(2, R_EDX, mkexpr(resHi));
   2751          putIReg(2, R_EAX, mkexpr(resLo));
   2752          break;
   2753       }
   2754       case Ity_I8: {
   2755          IRTemp res16   = newTemp(Ity_I16);
   2756          IRTemp resHi   = newTemp(Ity_I8);
   2757          IRTemp resLo   = newTemp(Ity_I8);
   2758          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   2759          UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
   2760          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   2761          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   2762          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   2763          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   2764          putIReg(2, R_EAX, mkexpr(res16));
   2765          break;
   2766       }
   2767       default:
   2768          vpanic("codegen_mulL_A_D(x86)");
   2769    }
   2770    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   2771 }
   2772 
   2773 
   2774 /* Group 3 extended opcodes. */
   2775 static
   2776 UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
   2777 {
   2778    UInt    d32;
   2779    UChar   modrm;
   2780    HChar   dis_buf[50];
   2781    Int     len;
   2782    IRTemp  addr;
   2783    IRType  ty = szToITy(sz);
   2784    IRTemp  t1 = newTemp(ty);
   2785    IRTemp dst1, src, dst0;
   2786 
   2787    *decode_OK = True; /* may change this later */
   2788 
   2789    modrm = getIByte(delta);
   2790 
   2791    if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
   2792       /* LOCK prefix only allowed with not and neg subopcodes */
   2793       *decode_OK = False;
   2794       return delta;
   2795    }
   2796 
   2797    if (epartIsReg(modrm)) {
   2798       switch (gregOfRM(modrm)) {
   2799          case 0: { /* TEST */
   2800             delta++; d32 = getUDisp(sz, delta); delta += sz;
   2801             dst1 = newTemp(ty);
   2802             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2803                                getIReg(sz,eregOfRM(modrm)),
   2804                                mkU(ty,d32)));
   2805             setFlags_DEP1( Iop_And8, dst1, ty );
   2806             DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
   2807                                       nameIReg(sz, eregOfRM(modrm)));
   2808             break;
   2809          }
   2810          case 1: /* UNDEFINED */
   2811            /* The Intel docs imply this insn is undefined and binutils
   2812               agrees.  Unfortunately Core 2 will run it (with who
   2813               knows what result?)  sandpile.org reckons it's an alias
   2814               for case 0.  We play safe. */
   2815            *decode_OK = False;
   2816            break;
   2817          case 2: /* NOT */
   2818             delta++;
   2819             putIReg(sz, eregOfRM(modrm),
   2820                         unop(mkSizedOp(ty,Iop_Not8),
   2821                              getIReg(sz, eregOfRM(modrm))));
   2822             DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2823             break;
   2824          case 3: /* NEG */
   2825             delta++;
   2826             dst0 = newTemp(ty);
   2827             src  = newTemp(ty);
   2828             dst1 = newTemp(ty);
   2829             assign(dst0, mkU(ty,0));
   2830             assign(src,  getIReg(sz,eregOfRM(modrm)));
   2831             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
   2832             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2833             putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
   2834             DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2835             break;
   2836          case 4: /* MUL (unsigned widening) */
   2837             delta++;
   2838             src = newTemp(ty);
   2839             assign(src, getIReg(sz,eregOfRM(modrm)));
   2840             codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
   2841             break;
   2842          case 5: /* IMUL (signed widening) */
   2843             delta++;
   2844             src = newTemp(ty);
   2845             assign(src, getIReg(sz,eregOfRM(modrm)));
   2846             codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
   2847             break;
   2848          case 6: /* DIV */
   2849             delta++;
   2850             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2851             codegen_div ( sz, t1, False );
   2852             DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2853             break;
   2854          case 7: /* IDIV */
   2855             delta++;
   2856             assign( t1, getIReg(sz, eregOfRM(modrm)) );
   2857             codegen_div ( sz, t1, True );
   2858             DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   2859             break;
   2860          default:
   2861             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2862             vpanic("Grp3(x86)");
   2863       }
   2864    } else {
   2865       addr = disAMode ( &len, sorb, delta, dis_buf );
   2866       t1   = newTemp(ty);
   2867       delta += len;
   2868       assign(t1, loadLE(ty,mkexpr(addr)));
   2869       switch (gregOfRM(modrm)) {
   2870          case 0: { /* TEST */
   2871             d32 = getUDisp(sz, delta); delta += sz;
   2872             dst1 = newTemp(ty);
   2873             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   2874                                mkexpr(t1), mkU(ty,d32)));
   2875             setFlags_DEP1( Iop_And8, dst1, ty );
   2876             DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   2877             break;
   2878          }
   2879          case 1: /* UNDEFINED */
   2880            /* See comment above on R case */
   2881            *decode_OK = False;
   2882            break;
   2883          case 2: /* NOT */
   2884             dst1 = newTemp(ty);
   2885             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   2886             if (locked) {
   2887                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2888                                     guest_EIP_curr_instr );
   2889             } else {
   2890                storeLE( mkexpr(addr), mkexpr(dst1) );
   2891             }
   2892             DIP("not%c %s\n", nameISize(sz), dis_buf);
   2893             break;
   2894          case 3: /* NEG */
   2895             dst0 = newTemp(ty);
   2896             src  = newTemp(ty);
   2897             dst1 = newTemp(ty);
   2898             assign(dst0, mkU(ty,0));
   2899             assign(src,  mkexpr(t1));
   2900             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
   2901                                mkexpr(dst0), mkexpr(src)));
   2902             if (locked) {
   2903                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   2904                                     guest_EIP_curr_instr );
   2905             } else {
   2906                storeLE( mkexpr(addr), mkexpr(dst1) );
   2907             }
   2908             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   2909             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   2910             break;
   2911          case 4: /* MUL */
   2912             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   2913             break;
   2914          case 5: /* IMUL */
   2915             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   2916             break;
   2917          case 6: /* DIV */
   2918             codegen_div ( sz, t1, False );
   2919             DIP("div%c %s\n", nameISize(sz), dis_buf);
   2920             break;
   2921          case 7: /* IDIV */
   2922             codegen_div ( sz, t1, True );
   2923             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   2924             break;
   2925          default:
   2926             /* This can't happen - gregOfRM should return 0 .. 7 only */
   2927             vpanic("Grp3(x86)");
   2928       }
   2929    }
   2930    return delta;
   2931 }
   2932 
   2933 
   2934 /* Group 4 extended opcodes. */
   2935 static
   2936 UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
   2937 {
   2938    Int   alen;
   2939    UChar modrm;
   2940    HChar dis_buf[50];
   2941    IRType ty = Ity_I8;
   2942    IRTemp t1 = newTemp(ty);
   2943    IRTemp t2 = newTemp(ty);
   2944 
   2945    *decode_OK = True;
   2946 
   2947    modrm = getIByte(delta);
   2948 
   2949    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   2950       /* LOCK prefix only allowed with inc and dec subopcodes */
   2951       *decode_OK = False;
   2952       return delta;
   2953    }
   2954 
   2955    if (epartIsReg(modrm)) {
   2956       assign(t1, getIReg(1, eregOfRM(modrm)));
   2957       switch (gregOfRM(modrm)) {
   2958          case 0: /* INC */
   2959             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2960             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2961             setFlags_INC_DEC( True, t2, ty );
   2962             break;
   2963          case 1: /* DEC */
   2964             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2965             putIReg(1, eregOfRM(modrm), mkexpr(t2));
   2966             setFlags_INC_DEC( False, t2, ty );
   2967             break;
   2968          default:
   2969             *decode_OK = False;
   2970             return delta;
   2971       }
   2972       delta++;
   2973       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
   2974                       nameIReg(1, eregOfRM(modrm)));
   2975    } else {
   2976       IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
   2977       assign( t1, loadLE(ty, mkexpr(addr)) );
   2978       switch (gregOfRM(modrm)) {
   2979          case 0: /* INC */
   2980             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   2981             if (locked) {
   2982                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   2983                       guest_EIP_curr_instr );
   2984             } else {
   2985                storeLE( mkexpr(addr), mkexpr(t2) );
   2986             }
   2987             setFlags_INC_DEC( True, t2, ty );
   2988             break;
   2989          case 1: /* DEC */
   2990             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   2991             if (locked) {
   2992                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   2993                       guest_EIP_curr_instr );
   2994             } else {
   2995                storeLE( mkexpr(addr), mkexpr(t2) );
   2996             }
   2997             setFlags_INC_DEC( False, t2, ty );
   2998             break;
   2999          default:
   3000             *decode_OK = False;
   3001             return delta;
   3002       }
   3003       delta += alen;
   3004       DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
   3005    }
   3006    return delta;
   3007 }
   3008 
   3009 
   3010 /* Group 5 extended opcodes. */
   3011 static
   3012 UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
   3013                 DisResult* dres, Bool* decode_OK )
   3014 {
   3015    Int     len;
   3016    UChar   modrm;
   3017    HChar   dis_buf[50];
   3018    IRTemp  addr = IRTemp_INVALID;
   3019    IRType  ty = szToITy(sz);
   3020    IRTemp  t1 = newTemp(ty);
   3021    IRTemp  t2 = IRTemp_INVALID;
   3022 
   3023    *decode_OK = True;
   3024 
   3025    modrm = getIByte(delta);
   3026 
   3027    if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
   3028       /* LOCK prefix only allowed with inc and dec subopcodes */
   3029       *decode_OK = False;
   3030       return delta;
   3031    }
   3032 
   3033    if (epartIsReg(modrm)) {
   3034       assign(t1, getIReg(sz,eregOfRM(modrm)));
   3035       switch (gregOfRM(modrm)) {
   3036          case 0: /* INC */
   3037             vassert(sz == 2 || sz == 4);
   3038             t2 = newTemp(ty);
   3039             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3040                              mkexpr(t1), mkU(ty,1)));
   3041             setFlags_INC_DEC( True, t2, ty );
   3042             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3043             break;
   3044          case 1: /* DEC */
   3045             vassert(sz == 2 || sz == 4);
   3046             t2 = newTemp(ty);
   3047             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3048                              mkexpr(t1), mkU(ty,1)));
   3049             setFlags_INC_DEC( False, t2, ty );
   3050             putIReg(sz,eregOfRM(modrm),mkexpr(t2));
   3051             break;
   3052          case 2: /* call Ev */
   3053             vassert(sz == 4);
   3054             t2 = newTemp(Ity_I32);
   3055             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3056             putIReg(4, R_ESP, mkexpr(t2));
   3057             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
   3058             jmp_treg(Ijk_Call,t1);
   3059             dres->whatNext = Dis_StopHere;
   3060             break;
   3061          case 4: /* jmp Ev */
   3062             vassert(sz == 4);
   3063             jmp_treg(Ijk_Boring,t1);
   3064             dres->whatNext = Dis_StopHere;
   3065             break;
   3066          case 6: /* PUSH Ev */
   3067             vassert(sz == 4 || sz == 2);
   3068             t2 = newTemp(Ity_I32);
   3069             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3070             putIReg(4, R_ESP, mkexpr(t2) );
   3071             storeLE( mkexpr(t2), mkexpr(t1) );
   3072             break;
   3073          default:
   3074             *decode_OK = False;
   3075             return delta;
   3076       }
   3077       delta++;
   3078       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3079                        nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
   3080    } else {
   3081       addr = disAMode ( &len, sorb, delta, dis_buf );
   3082       assign(t1, loadLE(ty,mkexpr(addr)));
   3083       switch (gregOfRM(modrm)) {
   3084          case 0: /* INC */
   3085             t2 = newTemp(ty);
   3086             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   3087                              mkexpr(t1), mkU(ty,1)));
   3088             if (locked) {
   3089                casLE( mkexpr(addr),
   3090                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3091             } else {
   3092                storeLE(mkexpr(addr),mkexpr(t2));
   3093             }
   3094             setFlags_INC_DEC( True, t2, ty );
   3095             break;
   3096          case 1: /* DEC */
   3097             t2 = newTemp(ty);
   3098             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   3099                              mkexpr(t1), mkU(ty,1)));
   3100             if (locked) {
   3101                casLE( mkexpr(addr),
   3102                       mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   3103             } else {
   3104                storeLE(mkexpr(addr),mkexpr(t2));
   3105             }
   3106             setFlags_INC_DEC( False, t2, ty );
   3107             break;
   3108          case 2: /* call Ev */
   3109             vassert(sz == 4);
   3110             t2 = newTemp(Ity_I32);
   3111             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   3112             putIReg(4, R_ESP, mkexpr(t2));
   3113             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
   3114             jmp_treg(Ijk_Call,t1);
   3115             dres->whatNext = Dis_StopHere;
   3116             break;
   3117          case 4: /* JMP Ev */
   3118             vassert(sz == 4);
   3119             jmp_treg(Ijk_Boring,t1);
   3120             dres->whatNext = Dis_StopHere;
   3121             break;
   3122          case 6: /* PUSH Ev */
   3123             vassert(sz == 4 || sz == 2);
   3124             t2 = newTemp(Ity_I32);
   3125             assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   3126             putIReg(4, R_ESP, mkexpr(t2) );
   3127             storeLE( mkexpr(t2), mkexpr(t1) );
   3128             break;
   3129          default:
   3130             *decode_OK = False;
   3131             return delta;
   3132       }
   3133       delta += len;
   3134       DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
   3135                        nameISize(sz), dis_buf);
   3136    }
   3137    return delta;
   3138 }
   3139 
   3140 
   3141 /*------------------------------------------------------------*/
   3142 /*--- Disassembling string ops (including REP prefixes)    ---*/
   3143 /*------------------------------------------------------------*/
   3144 
   3145 /* Code shared by all the string ops */
   3146 static
   3147 void dis_string_op_increment(Int sz, Int t_inc)
   3148 {
   3149    if (sz == 4 || sz == 2) {
   3150       assign( t_inc,
   3151               binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
   3152                                mkU8(sz/2) ) );
   3153    } else {
   3154       assign( t_inc,
   3155               IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
   3156    }
   3157 }
   3158 
   3159 static
   3160 void dis_string_op( void (*dis_OP)( Int, IRTemp ),
   3161                     Int sz, HChar* name, UChar sorb )
   3162 {
   3163    IRTemp t_inc = newTemp(Ity_I32);
   3164    vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
   3165    dis_string_op_increment(sz, t_inc);
   3166    dis_OP( sz, t_inc );
   3167    DIP("%s%c\n", name, nameISize(sz));
   3168 }
   3169 
   3170 static
   3171 void dis_MOVS ( Int sz, IRTemp t_inc )
   3172 {
   3173    IRType ty = szToITy(sz);
   3174    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3175    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3176 
   3177    assign( td, getIReg(4, R_EDI) );
   3178    assign( ts, getIReg(4, R_ESI) );
   3179 
   3180    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   3181 
   3182    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3183    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3184 }
   3185 
   3186 static
   3187 void dis_LODS ( Int sz, IRTemp t_inc )
   3188 {
   3189    IRType ty = szToITy(sz);
   3190    IRTemp ts = newTemp(Ity_I32);   /* ESI */
   3191 
   3192    assign( ts, getIReg(4, R_ESI) );
   3193 
   3194    putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
   3195 
   3196    putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3197 }
   3198 
   3199 static
   3200 void dis_STOS ( Int sz, IRTemp t_inc )
   3201 {
   3202    IRType ty = szToITy(sz);
   3203    IRTemp ta = newTemp(ty);        /* EAX */
   3204    IRTemp td = newTemp(Ity_I32);   /* EDI */
   3205 
   3206    assign( ta, getIReg(sz, R_EAX) );
   3207    assign( td, getIReg(4, R_EDI) );
   3208 
   3209    storeLE( mkexpr(td), mkexpr(ta) );
   3210 
   3211    putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3212 }
   3213 
   3214 static
   3215 void dis_CMPS ( Int sz, IRTemp t_inc )
   3216 {
   3217    IRType ty  = szToITy(sz);
   3218    IRTemp tdv = newTemp(ty);      /* (EDI) */
   3219    IRTemp tsv = newTemp(ty);      /* (ESI) */
   3220    IRTemp td  = newTemp(Ity_I32); /*  EDI  */
   3221    IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
   3222 
   3223    assign( td, getIReg(4, R_EDI) );
   3224    assign( ts, getIReg(4, R_ESI) );
   3225 
   3226    assign( tdv, loadLE(ty,mkexpr(td)) );
   3227    assign( tsv, loadLE(ty,mkexpr(ts)) );
   3228 
   3229    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   3230 
   3231    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3232    putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
   3233 }
   3234 
   3235 static
   3236 void dis_SCAS ( Int sz, IRTemp t_inc )
   3237 {
   3238    IRType ty  = szToITy(sz);
   3239    IRTemp ta  = newTemp(ty);       /*  EAX  */
   3240    IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
   3241    IRTemp tdv = newTemp(ty);       /* (EDI) */
   3242 
   3243    assign( ta, getIReg(sz, R_EAX) );
   3244    assign( td, getIReg(4, R_EDI) );
   3245 
   3246    assign( tdv, loadLE(ty,mkexpr(td)) );
   3247    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   3248 
   3249    putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
   3250 }
   3251 
   3252 
   3253 /* Wrap the appropriate string op inside a REP/REPE/REPNE.
   3254    We assume the insn is the last one in the basic block, and so emit a jump
   3255    to the next insn, rather than just falling through. */
   3256 static
   3257 void dis_REP_op ( X86Condcode cond,
   3258                   void (*dis_OP)(Int, IRTemp),
   3259                   Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
   3260 {
   3261    IRTemp t_inc = newTemp(Ity_I32);
   3262    IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
   3263 
   3264    assign( tc, getIReg(4,R_ECX) );
   3265 
   3266    stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
   3267                       Ijk_Boring,
   3268                       IRConst_U32(eip_next) ) );
   3269 
   3270    putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   3271 
   3272    dis_string_op_increment(sz, t_inc);
   3273    dis_OP (sz, t_inc);
   3274 
   3275    if (cond == X86CondAlways) {
   3276       jmp_lit(Ijk_Boring,eip);
   3277    } else {
   3278       stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
   3279                          Ijk_Boring,
   3280                          IRConst_U32(eip) ) );
   3281       jmp_lit(Ijk_Boring,eip_next);
   3282    }
   3283    DIP("%s%c\n", name, nameISize(sz));
   3284 }
   3285 
   3286 
   3287 /*------------------------------------------------------------*/
   3288 /*--- Arithmetic, etc.                                     ---*/
   3289 /*------------------------------------------------------------*/
   3290 
   3291 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   3292 static
   3293 UInt dis_mul_E_G ( UChar       sorb,
   3294                    Int         size,
   3295                    Int         delta0 )
   3296 {
   3297    Int    alen;
   3298    HChar  dis_buf[50];
   3299    UChar  rm = getIByte(delta0);
   3300    IRType ty = szToITy(size);
   3301    IRTemp te = newTemp(ty);
   3302    IRTemp tg = newTemp(ty);
   3303    IRTemp resLo = newTemp(ty);
   3304 
   3305    assign( tg, getIReg(size, gregOfRM(rm)) );
   3306    if (epartIsReg(rm)) {
   3307       assign( te, getIReg(size, eregOfRM(rm)) );
   3308    } else {
   3309       IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
   3310       assign( te, loadLE(ty,mkexpr(addr)) );
   3311    }
   3312 
   3313    setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
   3314 
   3315    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   3316 
   3317    putIReg(size, gregOfRM(rm), mkexpr(resLo) );
   3318 
   3319    if (epartIsReg(rm)) {
   3320       DIP("imul%c %s, %s\n", nameISize(size),
   3321                              nameIReg(size,eregOfRM(rm)),
   3322                              nameIReg(size,gregOfRM(rm)));
   3323       return 1+delta0;
   3324    } else {
   3325       DIP("imul%c %s, %s\n", nameISize(size),
   3326                              dis_buf, nameIReg(size,gregOfRM(rm)));
   3327       return alen+delta0;
   3328    }
   3329 }
   3330 
   3331 
   3332 /* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
   3333 static
   3334 UInt dis_imul_I_E_G ( UChar       sorb,
   3335                       Int         size,
   3336                       Int         delta,
   3337                       Int         litsize )
   3338 {
   3339    Int    d32, alen;
   3340    HChar  dis_buf[50];
   3341    UChar  rm = getIByte(delta);
   3342    IRType ty = szToITy(size);
   3343    IRTemp te = newTemp(ty);
   3344    IRTemp tl = newTemp(ty);
   3345    IRTemp resLo = newTemp(ty);
   3346 
   3347    vassert(size == 1 || size == 2 || size == 4);
   3348 
   3349    if (epartIsReg(rm)) {
   3350       assign(te, getIReg(size, eregOfRM(rm)));
   3351       delta++;
   3352    } else {
   3353       IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
   3354       assign(te, loadLE(ty, mkexpr(addr)));
   3355       delta += alen;
   3356    }
   3357    d32 = getSDisp(litsize,delta);
   3358    delta += litsize;
   3359 
   3360    if (size == 1) d32 &= 0xFF;
   3361    if (size == 2) d32 &= 0xFFFF;
   3362 
   3363    assign(tl, mkU(ty,d32));
   3364 
   3365    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   3366 
   3367    setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
   3368 
   3369    putIReg(size, gregOfRM(rm), mkexpr(resLo));
   3370 
   3371    DIP("imul %d, %s, %s\n", d32,
   3372        ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
   3373        nameIReg(size,gregOfRM(rm)) );
   3374    return delta;
   3375 }
   3376 
   3377 
   3378 /* Generate an IR sequence to do a count-leading-zeroes operation on
   3379    the supplied IRTemp, and return a new IRTemp holding the result.
   3380    'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
   3381    argument is zero, return the number of bits in the word (the
   3382    natural semantics). */
   3383 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   3384 {
   3385    vassert(ty == Ity_I32 || ty == Ity_I16);
   3386 
   3387    IRTemp src32 = newTemp(Ity_I32);
   3388    assign(src32, widenUto32( mkexpr(src) ));
   3389 
   3390    IRTemp src32x = newTemp(Ity_I32);
   3391    assign(src32x,
   3392           binop(Iop_Shl32, mkexpr(src32),
   3393                            mkU8(32 - 8 * sizeofIRType(ty))));
   3394 
   3395    // Clz32 has undefined semantics when its input is zero, so
   3396    // special-case around that.
   3397    IRTemp res32 = newTemp(Ity_I32);
   3398    assign(res32,
   3399           IRExpr_Mux0X(
   3400              unop(Iop_1Uto8,
   3401                   binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
   3402              unop(Iop_Clz32, mkexpr(src32x)),
   3403              mkU32(8 * sizeofIRType(ty))
   3404    ));
   3405 
   3406    IRTemp res = newTemp(ty);
   3407    assign(res, narrowTo(ty, mkexpr(res32)));
   3408    return res;
   3409 }
   3410 
   3411 
   3412 /*------------------------------------------------------------*/
   3413 /*---                                                      ---*/
   3414 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   3415 /*---                                                      ---*/
   3416 /*------------------------------------------------------------*/
   3417 
   3418 /* --- Helper functions for dealing with the register stack. --- */
   3419 
   3420 /* --- Set the emulation-warning pseudo-register. --- */
   3421 
   3422 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   3423 {
   3424    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3425    stmt( IRStmt_Put( OFFB_EMWARN, e ) );
   3426 }
   3427 
   3428 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   3429 
   3430 static IRExpr* mkQNaN64 ( void )
   3431 {
   3432   /* QNaN is 0 2047 1 0(51times)
   3433      == 0b 11111111111b 1 0(51times)
   3434      == 0x7FF8 0000 0000 0000
   3435    */
   3436    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   3437 }
   3438 
   3439 /* --------- Get/put the top-of-stack pointer. --------- */
   3440 
   3441 static IRExpr* get_ftop ( void )
   3442 {
   3443    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   3444 }
   3445 
   3446 static void put_ftop ( IRExpr* e )
   3447 {
   3448    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   3449    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   3450 }
   3451 
   3452 /* --------- Get/put the C3210 bits. --------- */
   3453 
   3454 static IRExpr* get_C3210 ( void )
   3455 {
   3456    return IRExpr_Get( OFFB_FC3210, Ity_I32 );
   3457 }
   3458 
   3459 static void put_C3210 ( IRExpr* e )
   3460 {
   3461    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   3462 }
   3463 
   3464 /* --------- Get/put the FPU rounding mode. --------- */
   3465 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   3466 {
   3467    return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
   3468 }
   3469 
   3470 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   3471 {
   3472    stmt( IRStmt_Put( OFFB_FPROUND, e ) );
   3473 }
   3474 
   3475 
   3476 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   3477 /* Produces a value in 0 .. 3, which is encoded as per the type
   3478    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   3479    per IRRoundingMode, we merely need to get it and mask it for
   3480    safety.
   3481 */
   3482 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   3483 {
   3484    return binop( Iop_And32, get_fpround(), mkU32(3) );
   3485 }
   3486 
   3487 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   3488 {
   3489    return mkU32(Irrm_NEAREST);
   3490 }
   3491 
   3492 
   3493 /* --------- Get/set FP register tag bytes. --------- */
   3494 
   3495 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   3496 
   3497 static void put_ST_TAG ( Int i, IRExpr* value )
   3498 {
   3499    IRRegArray* descr;
   3500    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   3501    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3502    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   3503 }
   3504 
   3505 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   3506    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   3507 
   3508 static IRExpr* get_ST_TAG ( Int i )
   3509 {
   3510    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   3511    return IRExpr_GetI( descr, get_ftop(), i );
   3512 }
   3513 
   3514 
   3515 /* --------- Get/set FP registers. --------- */
   3516 
   3517 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   3518    register's tag to indicate the register is full.  The previous
   3519    state of the register is not checked. */
   3520 
   3521 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   3522 {
   3523    IRRegArray* descr;
   3524    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   3525    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3526    stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
   3527    /* Mark the register as in-use. */
   3528    put_ST_TAG(i, mkU8(1));
   3529 }
   3530 
   3531 /* Given i, and some expression e, emit
   3532       ST(i) = is_full(i) ? NaN : e
   3533    and set the tag accordingly.
   3534 */
   3535 
   3536 static void put_ST ( Int i, IRExpr* value )
   3537 {
   3538    put_ST_UNCHECKED( i,
   3539                      IRExpr_Mux0X( get_ST_TAG(i),
   3540                                    /* 0 means empty */
   3541                                    value,
   3542                                    /* non-0 means full */
   3543                                    mkQNaN64()
   3544                    )
   3545    );
   3546 }
   3547 
   3548 
   3549 /* Given i, generate an expression yielding 'ST(i)'. */
   3550 
   3551 static IRExpr* get_ST_UNCHECKED ( Int i )
   3552 {
   3553    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   3554    return IRExpr_GetI( descr, get_ftop(), i );
   3555 }
   3556 
   3557 
   3558 /* Given i, generate an expression yielding
   3559   is_full(i) ? ST(i) : NaN
   3560 */
   3561 
   3562 static IRExpr* get_ST ( Int i )
   3563 {
   3564    return
   3565       IRExpr_Mux0X( get_ST_TAG(i),
   3566                     /* 0 means empty */
   3567                     mkQNaN64(),
   3568                     /* non-0 means full */
   3569                     get_ST_UNCHECKED(i));
   3570 }
   3571 
   3572 
   3573 /* Adjust FTOP downwards by one register. */
   3574 
   3575 static void fp_push ( void )
   3576 {
   3577    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   3578 }
   3579 
   3580 /* Adjust FTOP upwards by one register, and mark the vacated register
   3581    as empty.  */
   3582 
   3583 static void fp_pop ( void )
   3584 {
   3585    put_ST_TAG(0, mkU8(0));
   3586    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   3587 }
   3588 
   3589 /* Clear the C2 bit of the FPU status register, for
   3590    sin/cos/tan/sincos. */
   3591 
   3592 static void clear_C2 ( void )
   3593 {
   3594    put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
   3595 }
   3596 
   3597 /* Invent a plausible-looking FPU status word value:
   3598       ((ftop & 7) << 11) | (c3210 & 0x4700)
   3599  */
   3600 static IRExpr* get_FPU_sw ( void )
   3601 {
   3602    return
   3603       unop(Iop_32to16,
   3604            binop(Iop_Or32,
   3605                  binop(Iop_Shl32,
   3606                        binop(Iop_And32, get_ftop(), mkU32(7)),
   3607                              mkU8(11)),
   3608                        binop(Iop_And32, get_C3210(), mkU32(0x4700))
   3609       ));
   3610 }
   3611 
   3612 
   3613 /* ------------------------------------------------------- */
   3614 /* Given all that stack-mangling junk, we can now go ahead
   3615    and describe FP instructions.
   3616 */
   3617 
   3618 /* ST(0) = ST(0) `op` mem64/32(addr)
   3619    Need to check ST(0)'s tag on read, but not on write.
   3620 */
   3621 static
   3622 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3623                          IROp op, Bool dbl )
   3624 {
   3625    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3626    if (dbl) {
   3627       put_ST_UNCHECKED(0,
   3628          triop( op,
   3629                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3630                 get_ST(0),
   3631                 loadLE(Ity_F64,mkexpr(addr))
   3632          ));
   3633    } else {
   3634       put_ST_UNCHECKED(0,
   3635          triop( op,
   3636                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3637                 get_ST(0),
   3638                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   3639          ));
   3640    }
   3641 }
   3642 
   3643 
   3644 /* ST(0) = mem64/32(addr) `op` ST(0)
   3645    Need to check ST(0)'s tag on read, but not on write.
   3646 */
   3647 static
   3648 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   3649                             IROp op, Bool dbl )
   3650 {
   3651    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   3652    if (dbl) {
   3653       put_ST_UNCHECKED(0,
   3654          triop( op,
   3655                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3656                 loadLE(Ity_F64,mkexpr(addr)),
   3657                 get_ST(0)
   3658          ));
   3659    } else {
   3660       put_ST_UNCHECKED(0,
   3661          triop( op,
   3662                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3663                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   3664                 get_ST(0)
   3665          ));
   3666    }
   3667 }
   3668 
   3669 
   3670 /* ST(dst) = ST(dst) `op` ST(src).
   3671    Check dst and src tags when reading but not on write.
   3672 */
   3673 static
   3674 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3675                       Bool pop_after )
   3676 {
   3677    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3678                                  (Int)st_src, (Int)st_dst );
   3679    put_ST_UNCHECKED(
   3680       st_dst,
   3681       triop( op,
   3682              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3683              get_ST(st_dst),
   3684              get_ST(st_src) )
   3685    );
   3686    if (pop_after)
   3687       fp_pop();
   3688 }
   3689 
   3690 /* ST(dst) = ST(src) `op` ST(dst).
   3691    Check dst and src tags when reading but not on write.
   3692 */
   3693 static
   3694 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   3695                          Bool pop_after )
   3696 {
   3697    DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
   3698                                  (Int)st_src, (Int)st_dst );
   3699    put_ST_UNCHECKED(
   3700       st_dst,
   3701       triop( op,
   3702              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   3703              get_ST(st_src),
   3704              get_ST(st_dst) )
   3705    );
   3706    if (pop_after)
   3707       fp_pop();
   3708 }
   3709 
   3710 /* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   3711 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   3712 {
   3713    DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
   3714    /* This is a bit of a hack (and isn't really right).  It sets
   3715       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   3716       documentation implies A and S are unchanged.
   3717    */
   3718    /* It's also fishy in that it is used both for COMIP and
   3719       UCOMIP, and they aren't the same (although similar). */
   3720    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   3721    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   3722    stmt( IRStmt_Put( OFFB_CC_DEP1,
   3723                      binop( Iop_And32,
   3724                             binop(Iop_CmpF64, get_ST(0), get_ST(i)),
   3725                             mkU32(0x45)
   3726        )));
   3727    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3728       elimination of previous stores to this field work better. */
   3729    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   3730    if (pop_after)
   3731       fp_pop();
   3732 }
   3733 
   3734 
   3735 static
   3736 UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
   3737 {
   3738    Int    len;
   3739    UInt   r_src, r_dst;
   3740    HChar  dis_buf[50];
   3741    IRTemp t1, t2;
   3742 
   3743    /* On entry, delta points at the second byte of the insn (the modrm
   3744       byte).*/
   3745    UChar first_opcode = getIByte(delta-1);
   3746    UChar modrm        = getIByte(delta+0);
   3747 
   3748    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   3749 
   3750    if (first_opcode == 0xD8) {
   3751       if (modrm < 0xC0) {
   3752 
   3753          /* bits 5,4,3 are an opcode extension, and the modRM also
   3754            specifies an address. */
   3755          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3756          delta += len;
   3757 
   3758          switch (gregOfRM(modrm)) {
   3759 
   3760             case 0: /* FADD single-real */
   3761                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   3762                break;
   3763 
   3764             case 1: /* FMUL single-real */
   3765                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   3766                break;
   3767 
   3768             case 2: /* FCOM single-real */
   3769                DIP("fcoms %s\n", dis_buf);
   3770                /* This forces C1 to zero, which isn't right. */
   3771                put_C3210(
   3772                    binop( Iop_And32,
   3773                           binop(Iop_Shl32,
   3774                                 binop(Iop_CmpF64,
   3775                                       get_ST(0),
   3776                                       unop(Iop_F32toF64,
   3777                                            loadLE(Ity_F32,mkexpr(addr)))),
   3778                                 mkU8(8)),
   3779                           mkU32(0x4500)
   3780                    ));
   3781                break;
   3782 
   3783             case 3: /* FCOMP single-real */
   3784                DIP("fcomps %s\n", dis_buf);
   3785                /* This forces C1 to zero, which isn't right. */
   3786                put_C3210(
   3787                    binop( Iop_And32,
   3788                           binop(Iop_Shl32,
   3789                                 binop(Iop_CmpF64,
   3790                                       get_ST(0),
   3791                                       unop(Iop_F32toF64,
   3792                                            loadLE(Ity_F32,mkexpr(addr)))),
   3793                                 mkU8(8)),
   3794                           mkU32(0x4500)
   3795                    ));
   3796                fp_pop();
   3797                break;
   3798 
   3799             case 4: /* FSUB single-real */
   3800                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   3801                break;
   3802 
   3803             case 5: /* FSUBR single-real */
   3804                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   3805                break;
   3806 
   3807             case 6: /* FDIV single-real */
   3808                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   3809                break;
   3810 
   3811             case 7: /* FDIVR single-real */
   3812                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   3813                break;
   3814 
   3815             default:
   3816                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   3817                vex_printf("first_opcode == 0xD8\n");
   3818                goto decode_fail;
   3819          }
   3820       } else {
   3821          delta++;
   3822          switch (modrm) {
   3823 
   3824             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   3825                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   3826                break;
   3827 
   3828             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   3829                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   3830                break;
   3831 
   3832             /* Dunno if this is right */
   3833             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   3834                r_dst = (UInt)modrm - 0xD0;
   3835                DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
   3836                /* This forces C1 to zero, which isn't right. */
   3837                put_C3210(
   3838                    binop( Iop_And32,
   3839                           binop(Iop_Shl32,
   3840                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3841                                 mkU8(8)),
   3842                           mkU32(0x4500)
   3843                    ));
   3844                break;
   3845 
   3846             /* Dunno if this is right */
   3847             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   3848                r_dst = (UInt)modrm - 0xD8;
   3849                DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
   3850                /* This forces C1 to zero, which isn't right. */
   3851                put_C3210(
   3852                    binop( Iop_And32,
   3853                           binop(Iop_Shl32,
   3854                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   3855                                 mkU8(8)),
   3856                           mkU32(0x4500)
   3857                    ));
   3858                fp_pop();
   3859                break;
   3860 
   3861             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   3862                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   3863                break;
   3864 
   3865             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   3866                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   3867                break;
   3868 
   3869             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   3870                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   3871                break;
   3872 
   3873             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   3874                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   3875                break;
   3876 
   3877             default:
   3878                goto decode_fail;
   3879          }
   3880       }
   3881    }
   3882 
   3883    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   3884    else
   3885    if (first_opcode == 0xD9) {
   3886       if (modrm < 0xC0) {
   3887 
   3888          /* bits 5,4,3 are an opcode extension, and the modRM also
   3889             specifies an address. */
   3890          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   3891          delta += len;
   3892 
   3893          switch (gregOfRM(modrm)) {
   3894 
   3895             case 0: /* FLD single-real */
   3896                DIP("flds %s\n", dis_buf);
   3897                fp_push();
   3898                put_ST(0, unop(Iop_F32toF64,
   3899                               loadLE(Ity_F32, mkexpr(addr))));
   3900                break;
   3901 
   3902             case 2: /* FST single-real */
   3903                DIP("fsts %s\n", dis_buf);
   3904                storeLE(mkexpr(addr),
   3905                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3906                break;
   3907 
   3908             case 3: /* FSTP single-real */
   3909                DIP("fstps %s\n", dis_buf);
   3910                storeLE(mkexpr(addr),
   3911                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   3912                fp_pop();
   3913                break;
   3914 
   3915             case 4: { /* FLDENV m28 */
   3916                /* Uses dirty helper:
   3917                      VexEmWarn x86g_do_FLDENV ( VexGuestX86State*, HWord ) */
   3918                IRTemp   ew = newTemp(Ity_I32);
   3919                IRDirty* d  = unsafeIRDirty_0_N (
   3920                                 0/*regparms*/,
   3921                                 "x86g_dirtyhelper_FLDENV",
   3922                                 &x86g_dirtyhelper_FLDENV,
   3923                                 mkIRExprVec_1( mkexpr(addr) )
   3924                              );
   3925                d->needsBBP = True;
   3926                d->tmp      = ew;
   3927                /* declare we're reading memory */
   3928                d->mFx   = Ifx_Read;
   3929                d->mAddr = mkexpr(addr);
   3930                d->mSize = 28;
   3931 
   3932                /* declare we're writing guest state */
   3933                d->nFxState = 4;
   3934 
   3935                d->fxState[0].fx     = Ifx_Write;
   3936                d->fxState[0].offset = OFFB_FTOP;
   3937                d->fxState[0].size   = sizeof(UInt);
   3938 
   3939                d->fxState[1].fx     = Ifx_Write;
   3940                d->fxState[1].offset = OFFB_FPTAGS;
   3941                d->fxState[1].size   = 8 * sizeof(UChar);
   3942 
   3943                d->fxState[2].fx     = Ifx_Write;
   3944                d->fxState[2].offset = OFFB_FPROUND;
   3945                d->fxState[2].size   = sizeof(UInt);
   3946 
   3947                d->fxState[3].fx     = Ifx_Write;
   3948                d->fxState[3].offset = OFFB_FC3210;
   3949                d->fxState[3].size   = sizeof(UInt);
   3950 
   3951                stmt( IRStmt_Dirty(d) );
   3952 
   3953                /* ew contains any emulation warning we may need to
   3954                   issue.  If needed, side-exit to the next insn,
   3955                   reporting the warning, so that Valgrind's dispatcher
   3956                   sees the warning. */
   3957                put_emwarn( mkexpr(ew) );
   3958                stmt(
   3959                   IRStmt_Exit(
   3960                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   3961                      Ijk_EmWarn,
   3962                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   3963                   )
   3964                );
   3965 
   3966                DIP("fldenv %s\n", dis_buf);
   3967                break;
   3968             }
   3969 
   3970             case 5: {/* FLDCW */
   3971                /* The only thing we observe in the control word is the
   3972                   rounding mode.  Therefore, pass the 16-bit value
   3973                   (x87 native-format control word) to a clean helper,
   3974                   getting back a 64-bit value, the lower half of which
   3975                   is the FPROUND value to store, and the upper half of
   3976                   which is the emulation-warning token which may be
   3977                   generated.
   3978                */
   3979                /* ULong x86g_check_fldcw ( UInt ); */
   3980                IRTemp t64 = newTemp(Ity_I64);
   3981                IRTemp ew = newTemp(Ity_I32);
   3982                DIP("fldcw %s\n", dis_buf);
   3983                assign( t64, mkIRExprCCall(
   3984                                Ity_I64, 0/*regparms*/,
   3985                                "x86g_check_fldcw",
   3986                                &x86g_check_fldcw,
   3987                                mkIRExprVec_1(
   3988                                   unop( Iop_16Uto32,
   3989                                         loadLE(Ity_I16, mkexpr(addr)))
   3990                                )
   3991                             )
   3992                      );
   3993 
   3994                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   3995                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   3996                put_emwarn( mkexpr(ew) );
   3997                /* Finally, if an emulation warning was reported,
   3998                   side-exit to the next insn, reporting the warning,
   3999                   so that Valgrind's dispatcher sees the warning. */
   4000                stmt(
   4001                   IRStmt_Exit(
   4002                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4003                      Ijk_EmWarn,
   4004                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   4005                   )
   4006                );
   4007                break;
   4008             }
   4009 
   4010             case 6: { /* FNSTENV m28 */
   4011                /* Uses dirty helper:
   4012                      void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
   4013                IRDirty* d = unsafeIRDirty_0_N (
   4014                                0/*regparms*/,
   4015                                "x86g_dirtyhelper_FSTENV",
   4016                                &x86g_dirtyhelper_FSTENV,
   4017                                mkIRExprVec_1( mkexpr(addr) )
   4018                             );
   4019                d->needsBBP = True;
   4020                /* declare we're writing memory */
   4021                d->mFx   = Ifx_Write;
   4022                d->mAddr = mkexpr(addr);
   4023                d->mSize = 28;
   4024 
   4025                /* declare we're reading guest state */
   4026                d->nFxState = 4;
   4027 
   4028                d->fxState[0].fx     = Ifx_Read;
   4029                d->fxState[0].offset = OFFB_FTOP;
   4030                d->fxState[0].size   = sizeof(UInt);
   4031 
   4032                d->fxState[1].fx     = Ifx_Read;
   4033                d->fxState[1].offset = OFFB_FPTAGS;
   4034                d->fxState[1].size   = 8 * sizeof(UChar);
   4035 
   4036                d->fxState[2].fx     = Ifx_Read;
   4037                d->fxState[2].offset = OFFB_FPROUND;
   4038                d->fxState[2].size   = sizeof(UInt);
   4039 
   4040                d->fxState[3].fx     = Ifx_Read;
   4041                d->fxState[3].offset = OFFB_FC3210;
   4042                d->fxState[3].size   = sizeof(UInt);
   4043 
   4044                stmt( IRStmt_Dirty(d) );
   4045 
   4046                DIP("fnstenv %s\n", dis_buf);
   4047                break;
   4048             }
   4049 
   4050             case 7: /* FNSTCW */
   4051               /* Fake up a native x87 FPU control word.  The only
   4052                  thing it depends on is FPROUND[1:0], so call a clean
   4053                  helper to cook it up. */
   4054                /* UInt x86g_create_fpucw ( UInt fpround ) */
   4055                DIP("fnstcw %s\n", dis_buf);
   4056                storeLE(
   4057                   mkexpr(addr),
   4058                   unop( Iop_32to16,
   4059                         mkIRExprCCall(
   4060                            Ity_I32, 0/*regp*/,
   4061                            "x86g_create_fpucw", &x86g_create_fpucw,
   4062                            mkIRExprVec_1( get_fpround() )
   4063                         )
   4064                   )
   4065                );
   4066                break;
   4067 
   4068             default:
   4069                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4070                vex_printf("first_opcode == 0xD9\n");
   4071                goto decode_fail;
   4072          }
   4073 
   4074       } else {
   4075          delta++;
   4076          switch (modrm) {
   4077 
   4078             case 0xC0 ... 0xC7: /* FLD %st(?) */
   4079                r_src = (UInt)modrm - 0xC0;
   4080                DIP("fld %%st(%d)\n", (Int)r_src);
   4081                t1 = newTemp(Ity_F64);
   4082                assign(t1, get_ST(r_src));
   4083                fp_push();
   4084                put_ST(0, mkexpr(t1));
   4085                break;
   4086 
   4087             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   4088                r_src = (UInt)modrm - 0xC8;
   4089                DIP("fxch %%st(%d)\n", (Int)r_src);
   4090                t1 = newTemp(Ity_F64);
   4091                t2 = newTemp(Ity_F64);
   4092                assign(t1, get_ST(0));
   4093                assign(t2, get_ST(r_src));
   4094                put_ST_UNCHECKED(0, mkexpr(t2));
   4095                put_ST_UNCHECKED(r_src, mkexpr(t1));
   4096                break;
   4097 
   4098             case 0xE0: /* FCHS */
   4099                DIP("fchs\n");
   4100                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   4101                break;
   4102 
   4103             case 0xE1: /* FABS */
   4104                DIP("fabs\n");
   4105                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   4106                break;
   4107 
   4108             case 0xE4: /* FTST */
   4109                DIP("ftst\n");
   4110                /* This forces C1 to zero, which isn't right. */
   4111                /* Well, in fact the Intel docs say (bizarrely): "C1 is
   4112                   set to 0 if stack underflow occurred; otherwise, set
   4113                   to 0" which is pretty nonsensical.  I guess it's a
   4114                    typo. */
   4115                put_C3210(
   4116                    binop( Iop_And32,
   4117                           binop(Iop_Shl32,
   4118                                 binop(Iop_CmpF64,
   4119                                       get_ST(0),
   4120                                       IRExpr_Const(IRConst_F64i(0x0ULL))),
   4121                                 mkU8(8)),
   4122                           mkU32(0x4500)
   4123                    ));
   4124                break;
   4125 
   4126             case 0xE5: { /* FXAM */
   4127                /* This is an interesting one.  It examines %st(0),
   4128                   regardless of whether the tag says it's empty or not.
   4129                   Here, just pass both the tag (in our format) and the
   4130                   value (as a double, actually a ULong) to a helper
   4131                   function. */
   4132                IRExpr** args
   4133                   = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
   4134                                    unop(Iop_ReinterpF64asI64,
   4135                                         get_ST_UNCHECKED(0)) );
   4136                put_C3210(mkIRExprCCall(
   4137                             Ity_I32,
   4138                             0/*regparm*/,
   4139                             "x86g_calculate_FXAM", &x86g_calculate_FXAM,
   4140                             args
   4141                         ));
   4142                DIP("fxam\n");
   4143                break;
   4144             }
   4145 
   4146             case 0xE8: /* FLD1 */
   4147                DIP("fld1\n");
   4148                fp_push();
   4149                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   4150                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   4151                break;
   4152 
   4153             case 0xE9: /* FLDL2T */
   4154                DIP("fldl2t\n");
   4155                fp_push();
   4156                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   4157                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   4158                break;
   4159 
   4160             case 0xEA: /* FLDL2E */
   4161                DIP("fldl2e\n");
   4162                fp_push();
   4163                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   4164                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   4165                break;
   4166 
   4167             case 0xEB: /* FLDPI */
   4168                DIP("fldpi\n");
   4169                fp_push();
   4170                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   4171                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   4172                break;
   4173 
   4174             case 0xEC: /* FLDLG2 */
   4175                DIP("fldlg2\n");
   4176                fp_push();
   4177                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   4178                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   4179                break;
   4180 
   4181             case 0xED: /* FLDLN2 */
   4182                DIP("fldln2\n");
   4183                fp_push();
   4184                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   4185                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   4186                break;
   4187 
   4188             case 0xEE: /* FLDZ */
   4189                DIP("fldz\n");
   4190                fp_push();
   4191                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   4192                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   4193                break;
   4194 
   4195             case 0xF0: /* F2XM1 */
   4196                DIP("f2xm1\n");
   4197                put_ST_UNCHECKED(0,
   4198                   binop(Iop_2xm1F64,
   4199                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4200                         get_ST(0)));
   4201                break;
   4202 
   4203             case 0xF1: /* FYL2X */
   4204                DIP("fyl2x\n");
   4205                put_ST_UNCHECKED(1,
   4206                   triop(Iop_Yl2xF64,
   4207                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4208                         get_ST(1),
   4209                         get_ST(0)));
   4210                fp_pop();
   4211                break;
   4212 
   4213             case 0xF2: /* FPTAN */
   4214                DIP("fptan\n");
   4215                put_ST_UNCHECKED(0,
   4216                   binop(Iop_TanF64,
   4217                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4218                         get_ST(0)));
   4219                fp_push(); /* tan(x) ends up in ST(1); 1.0 goes in the new ST(0) */
   4220                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   4221                clear_C2(); /* HACK */
   4222                break;
   4223 
   4224             case 0xF3: /* FPATAN */
   4225                DIP("fpatan\n");
   4226                put_ST_UNCHECKED(1,
   4227                   triop(Iop_AtanF64,
   4228                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4229                         get_ST(1),
   4230                         get_ST(0)));
   4231                fp_pop();
   4232                break;
   4233 
   4234             case 0xF4: { /* FXTRACT */
   4235                IRTemp argF = newTemp(Ity_F64);
   4236                IRTemp sigF = newTemp(Ity_F64);
   4237                IRTemp expF = newTemp(Ity_F64);
   4238                IRTemp argI = newTemp(Ity_I64);
   4239                IRTemp sigI = newTemp(Ity_I64);
   4240                IRTemp expI = newTemp(Ity_I64);
   4241                DIP("fxtract\n");
   4242                assign( argF, get_ST(0) );
   4243                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   4244                assign( sigI,
   4245                        mkIRExprCCall(
   4246                           Ity_I64, 0/*regparms*/,
   4247                           "x86amd64g_calculate_FXTRACT",
   4248                           &x86amd64g_calculate_FXTRACT,
   4249                           mkIRExprVec_2( mkexpr(argI),
   4250                                          mkIRExpr_HWord(0)/*sig*/ ))
   4251                );
   4252                assign( expI,
   4253                        mkIRExprCCall(
   4254                           Ity_I64, 0/*regparms*/,
   4255                           "x86amd64g_calculate_FXTRACT",
   4256                           &x86amd64g_calculate_FXTRACT,
   4257                           mkIRExprVec_2( mkexpr(argI),
   4258                                          mkIRExpr_HWord(1)/*exp*/ ))
   4259                );
   4260                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   4261                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   4262                /* exponent */
   4263                put_ST_UNCHECKED(0, mkexpr(expF) );
   4264                fp_push();
   4265                /* significand */
   4266                put_ST(0, mkexpr(sigF) );
   4267                break;
   4268             }
   4269 
   4270             case 0xF5: { /* FPREM1 -- IEEE compliant */
   4271                IRTemp a1 = newTemp(Ity_F64);
   4272                IRTemp a2 = newTemp(Ity_F64);
   4273                DIP("fprem1\n");
   4274                /* Do FPREM1 twice, once to get the remainder, and once
   4275                   to get the C3210 flag values. */
   4276                assign( a1, get_ST(0) );
   4277                assign( a2, get_ST(1) );
   4278                put_ST_UNCHECKED(0,
   4279                   triop(Iop_PRem1F64,
   4280                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4281                         mkexpr(a1),
   4282                         mkexpr(a2)));
   4283                put_C3210(
   4284                   triop(Iop_PRem1C3210F64,
   4285                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4286                         mkexpr(a1),
   4287                         mkexpr(a2)) );
   4288                break;
   4289             }
   4290 
   4291             case 0xF7: /* FINCSTP -- only FTOP is changed, by +1 */
   4292                DIP("fincstp\n");
   4293                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   4294                break;
   4295 
   4296             case 0xF8: { /* FPREM -- not IEEE compliant */
   4297                IRTemp a1 = newTemp(Ity_F64);
   4298                IRTemp a2 = newTemp(Ity_F64);
   4299                DIP("fprem\n");
   4300                /* Do FPREM twice, once to get the remainder, and once
   4301                   to get the C3210 flag values. */
   4302                assign( a1, get_ST(0) );
   4303                assign( a2, get_ST(1) );
   4304                put_ST_UNCHECKED(0,
   4305                   triop(Iop_PRemF64,
   4306                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4307                         mkexpr(a1),
   4308                         mkexpr(a2)));
   4309                put_C3210(
   4310                   triop(Iop_PRemC3210F64,
   4311                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4312                         mkexpr(a1),
   4313                         mkexpr(a2)) );
   4314                break;
   4315             }
   4316 
   4317             case 0xF9: /* FYL2XP1 */
   4318                DIP("fyl2xp1\n");
   4319                put_ST_UNCHECKED(1,
   4320                   triop(Iop_Yl2xp1F64,
   4321                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4322                         get_ST(1),
   4323                         get_ST(0)));
   4324                fp_pop();
   4325                break;
   4326 
   4327             case 0xFA: /* FSQRT */
   4328                DIP("fsqrt\n");
   4329                put_ST_UNCHECKED(0,
   4330                   binop(Iop_SqrtF64,
   4331                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4332                         get_ST(0)));
   4333                break;
   4334 
   4335             case 0xFB: { /* FSINCOS */
   4336                IRTemp a1 = newTemp(Ity_F64);
   4337                assign( a1, get_ST(0) );
   4338                DIP("fsincos\n");
   4339                put_ST_UNCHECKED(0,
   4340                   binop(Iop_SinF64,
   4341                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4342                         mkexpr(a1)));
   4343                fp_push();
   4344                put_ST(0,
   4345                   binop(Iop_CosF64,
   4346                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4347                         mkexpr(a1)));
   4348                clear_C2(); /* HACK */
   4349                break;
   4350             }
   4351 
   4352             case 0xFC: /* FRNDINT */
   4353                DIP("frndint\n");
   4354                put_ST_UNCHECKED(0,
   4355                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   4356                break;
   4357 
   4358             case 0xFD: /* FSCALE */
   4359                DIP("fscale\n");
   4360                put_ST_UNCHECKED(0,
   4361                   triop(Iop_ScaleF64,
   4362                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4363                         get_ST(0),
   4364                         get_ST(1)));
   4365                break;
   4366 
   4367             case 0xFE: /* FSIN */
   4368                DIP("fsin\n");
   4369                put_ST_UNCHECKED(0,
   4370                   binop(Iop_SinF64,
   4371                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4372                         get_ST(0)));
   4373                clear_C2(); /* HACK */
   4374                break;
   4375 
   4376             case 0xFF: /* FCOS */
   4377                DIP("fcos\n");
   4378                put_ST_UNCHECKED(0,
   4379                   binop(Iop_CosF64,
   4380                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4381                         get_ST(0)));
   4382                clear_C2(); /* HACK */
   4383                break;
   4384 
   4385             default:
   4386                goto decode_fail;
   4387          }
   4388       }
   4389    }
   4390 
   4391    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   4392    else
   4393    if (first_opcode == 0xDA) {
   4394 
   4395       if (modrm < 0xC0) {
   4396 
   4397          /* bits 5,4,3 are an opcode extension, and the modRM also
   4398             specifies an address. */
   4399          IROp   fop;
   4400          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4401          delta += len;
   4402          switch (gregOfRM(modrm)) {
   4403 
   4404             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   4405                DIP("fiaddl %s\n", dis_buf);
   4406                fop = Iop_AddF64;
   4407                goto do_fop_m32;
   4408 
   4409             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   4410                DIP("fimull %s\n", dis_buf);
   4411                fop = Iop_MulF64;
   4412                goto do_fop_m32;
   4413 
   4414             case 2: /* FICOM m32int */
   4415                DIP("ficoml %s\n", dis_buf);
   4416                /* This forces C1 to zero, which isn't right. */
   4417                put_C3210(
   4418                    binop( Iop_And32,
   4419                           binop(Iop_Shl32,
   4420                                 binop(Iop_CmpF64,
   4421                                       get_ST(0),
   4422                                       unop(Iop_I32StoF64,
   4423                                            loadLE(Ity_I32,mkexpr(addr)))),
   4424                                 mkU8(8)),
   4425                           mkU32(0x4500)
   4426                    ));
   4427                break;
   4428 
   4429             case 3: /* FICOMP m32int */
   4430                DIP("ficompl %s\n", dis_buf);
   4431                /* This forces C1 to zero, which isn't right. */
   4432                put_C3210(
   4433                    binop( Iop_And32,
   4434                           binop(Iop_Shl32,
   4435                                 binop(Iop_CmpF64,
   4436                                       get_ST(0),
   4437                                       unop(Iop_I32StoF64,
   4438                                            loadLE(Ity_I32,mkexpr(addr)))),
   4439                                 mkU8(8)),
   4440                           mkU32(0x4500)
   4441                    ));
   4442                fp_pop();
   4443                break;
   4444 
   4445             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   4446                DIP("fisubl %s\n", dis_buf);
   4447                fop = Iop_SubF64;
   4448                goto do_fop_m32;
   4449 
   4450             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   4451                DIP("fisubrl %s\n", dis_buf);
   4452                fop = Iop_SubF64;
   4453                goto do_foprev_m32;
   4454 
   4455             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   4456                DIP("fidivl %s\n", dis_buf);
   4457                fop = Iop_DivF64;
   4458                goto do_fop_m32;
   4459 
   4460             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   4461                DIP("fidivrl %s\n", dis_buf);
   4462                fop = Iop_DivF64;
   4463                goto do_foprev_m32;
   4464 
   4465             do_fop_m32:
   4466                put_ST_UNCHECKED(0,
   4467                   triop(fop,
   4468                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4469                         get_ST(0),
   4470                         unop(Iop_I32StoF64,
   4471                              loadLE(Ity_I32, mkexpr(addr)))));
   4472                break;
   4473 
   4474             do_foprev_m32:
   4475                put_ST_UNCHECKED(0,
   4476                   triop(fop,
   4477                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4478                         unop(Iop_I32StoF64,
   4479                              loadLE(Ity_I32, mkexpr(addr))),
   4480                         get_ST(0)));
   4481                break;
   4482 
   4483             default:
   4484                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4485                vex_printf("first_opcode == 0xDA\n");
   4486                goto decode_fail;
   4487          }
   4488 
   4489       } else {
   4490 
   4491          delta++;
   4492          switch (modrm) {
   4493 
   4494             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   4495                r_src = (UInt)modrm - 0xC0;
   4496                DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
   4497                put_ST_UNCHECKED(0,
   4498                                 IRExpr_Mux0X(
   4499                                     unop(Iop_1Uto8,
   4500                                          mk_x86g_calculate_condition(X86CondB)),
   4501                                     get_ST(0), get_ST(r_src)) );
   4502                break;
   4503 
   4504             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   4505                r_src = (UInt)modrm - 0xC8;
   4506                DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
   4507                put_ST_UNCHECKED(0,
   4508                                 IRExpr_Mux0X(
   4509                                     unop(Iop_1Uto8,
   4510                                          mk_x86g_calculate_condition(X86CondZ)),
   4511                                     get_ST(0), get_ST(r_src)) );
   4512                break;
   4513 
   4514             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   4515                r_src = (UInt)modrm - 0xD0;
   4516                DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
   4517                put_ST_UNCHECKED(0,
   4518                                 IRExpr_Mux0X(
   4519                                     unop(Iop_1Uto8,
   4520                                          mk_x86g_calculate_condition(X86CondBE)),
   4521                                     get_ST(0), get_ST(r_src)) );
   4522                break;
   4523 
   4524             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   4525                r_src = (UInt)modrm - 0xD8;
   4526                DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
   4527                put_ST_UNCHECKED(0,
   4528                                 IRExpr_Mux0X(
   4529                                     unop(Iop_1Uto8,
   4530                                          mk_x86g_calculate_condition(X86CondP)),
   4531                                     get_ST(0), get_ST(r_src)) );
   4532                break;
   4533 
   4534             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   4535                DIP("fucompp %%st(0),%%st(1)\n");
   4536                /* This forces C1 to zero, which isn't right. */
   4537                put_C3210(
   4538                    binop( Iop_And32,
   4539                           binop(Iop_Shl32,
   4540                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   4541                                 mkU8(8)),
   4542                           mkU32(0x4500)
   4543                    ));
   4544                fp_pop();
   4545                fp_pop();
   4546                break;
   4547 
   4548             default:
   4549                goto decode_fail;
   4550          }
   4551 
   4552       }
   4553    }
   4554 
   4555    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   4556    else
   4557    if (first_opcode == 0xDB) {
   4558       if (modrm < 0xC0) {
   4559 
   4560          /* bits 5,4,3 are an opcode extension, and the modRM also
   4561             specifies an address. */
   4562          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4563          delta += len;
   4564 
   4565          switch (gregOfRM(modrm)) {
   4566 
   4567             case 0: /* FILD m32int */
   4568                DIP("fildl %s\n", dis_buf);
   4569                fp_push();
   4570                put_ST(0, unop(Iop_I32StoF64,
   4571                               loadLE(Ity_I32, mkexpr(addr))));
   4572                break;
   4573 
   4574             case 1: /* FISTTPL m32 (SSE3) */
   4575                DIP("fisttpl %s\n", dis_buf);
   4576                storeLE( mkexpr(addr),
   4577                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   4578                fp_pop();
   4579                break;
   4580 
   4581             case 2: /* FIST m32 */
   4582                DIP("fistl %s\n", dis_buf);
   4583                storeLE( mkexpr(addr),
   4584                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4585                break;
   4586 
   4587             case 3: /* FISTP m32 */
   4588                DIP("fistpl %s\n", dis_buf);
   4589                storeLE( mkexpr(addr),
   4590                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   4591                fp_pop();
   4592                break;
   4593 
   4594             case 5: { /* FLD extended-real */
   4595                /* Uses dirty helper:
   4596                      ULong x86g_dirtyhelper_loadF80le ( UInt )
   4597                   addr holds the address.  First, do a dirty call to
   4598                   get hold of the data. */
   4599                IRTemp   val  = newTemp(Ity_I64);
   4600                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   4601 
   4602                IRDirty* d = unsafeIRDirty_1_N (
   4603                                val,
   4604                                0/*regparms*/,
   4605                                "x86g_dirtyhelper_loadF80le",
   4606                                &x86g_dirtyhelper_loadF80le,
   4607                                args
   4608                             );
   4609                /* declare that we're reading memory */
   4610                d->mFx   = Ifx_Read;
   4611                d->mAddr = mkexpr(addr);
   4612                d->mSize = 10;
   4613 
   4614                /* execute the dirty call, dumping the result in val. */
   4615                stmt( IRStmt_Dirty(d) );
   4616                fp_push();
   4617                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   4618 
   4619                DIP("fldt %s\n", dis_buf);
   4620                break;
   4621             }
   4622 
   4623             case 7: { /* FSTP extended-real */
   4624                /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
   4625                IRExpr** args
   4626                   = mkIRExprVec_2( mkexpr(addr),
   4627                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   4628 
   4629                IRDirty* d = unsafeIRDirty_0_N (
   4630                                0/*regparms*/,
   4631                                "x86g_dirtyhelper_storeF80le",
   4632                                &x86g_dirtyhelper_storeF80le,
   4633                                args
   4634                             );
   4635                /* declare we're writing memory */
   4636                d->mFx   = Ifx_Write;
   4637                d->mAddr = mkexpr(addr);
   4638                d->mSize = 10;
   4639 
   4640                /* execute the dirty call, then pop the value just stored. */
   4641                stmt( IRStmt_Dirty(d) );
   4642                fp_pop();
   4643 
   4644                DIP("fstpt %s\n", dis_buf);
   4645                break;
   4646             }
   4647 
   4648             default:
   4649                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4650                vex_printf("first_opcode == 0xDB\n");
   4651                goto decode_fail;
   4652          }
   4653 
   4654       } else {
   4655 
   4656          delta++;
   4657          switch (modrm) {
   4658 
   4659             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   4660                r_src = (UInt)modrm - 0xC0;
   4661                DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
   4662                put_ST_UNCHECKED(0,
   4663                                 IRExpr_Mux0X(
   4664                                     unop(Iop_1Uto8,
   4665                                          mk_x86g_calculate_condition(X86CondNB)),
   4666                                     get_ST(0), get_ST(r_src)) );
   4667                break;
   4668 
   4669             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   4670                r_src = (UInt)modrm - 0xC8;
   4671                DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
   4672                put_ST_UNCHECKED(0,
   4673                                 IRExpr_Mux0X(
   4674                                     unop(Iop_1Uto8,
   4675                                          mk_x86g_calculate_condition(X86CondNZ)),
   4676                                     get_ST(0), get_ST(r_src)) );
   4677                break;
   4678 
   4679             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   4680                r_src = (UInt)modrm - 0xD0;
   4681                DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
   4682                put_ST_UNCHECKED(0,
   4683                                 IRExpr_Mux0X(
   4684                                     unop(Iop_1Uto8,
   4685                                          mk_x86g_calculate_condition(X86CondNBE)),
   4686                                     get_ST(0), get_ST(r_src)) );
   4687                break;
   4688 
   4689             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   4690                r_src = (UInt)modrm - 0xD8;
   4691                DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
   4692                put_ST_UNCHECKED(0,
   4693                                 IRExpr_Mux0X(
   4694                                     unop(Iop_1Uto8,
   4695                                          mk_x86g_calculate_condition(X86CondNP)),
   4696                                     get_ST(0), get_ST(r_src)) );
   4697                break;
   4698 
   4699             case 0xE2:
   4700                DIP("fnclex\n");
   4701                break;
   4702 
   4703             case 0xE3: {
   4704                /* Uses dirty helper:
   4705                      void x86g_do_FINIT ( VexGuestX86State* ) */
   4706                IRDirty* d  = unsafeIRDirty_0_N (
   4707                                 0/*regparms*/,
   4708                                 "x86g_dirtyhelper_FINIT",
   4709                                 &x86g_dirtyhelper_FINIT,
   4710                                 mkIRExprVec_0()
   4711                              );
   4712                d->needsBBP = True;
   4713 
   4714                /* declare we're writing guest state */
   4715                d->nFxState = 5;
   4716 
   4717                d->fxState[0].fx     = Ifx_Write;
   4718                d->fxState[0].offset = OFFB_FTOP;
   4719                d->fxState[0].size   = sizeof(UInt);
   4720 
   4721                d->fxState[1].fx     = Ifx_Write;
   4722                d->fxState[1].offset = OFFB_FPREGS;
   4723                d->fxState[1].size   = 8 * sizeof(ULong);
   4724 
   4725                d->fxState[2].fx     = Ifx_Write;
   4726                d->fxState[2].offset = OFFB_FPTAGS;
   4727                d->fxState[2].size   = 8 * sizeof(UChar);
   4728 
   4729                d->fxState[3].fx     = Ifx_Write;
   4730                d->fxState[3].offset = OFFB_FPROUND;
   4731                d->fxState[3].size   = sizeof(UInt);
   4732 
   4733                d->fxState[4].fx     = Ifx_Write;
   4734                d->fxState[4].offset = OFFB_FC3210;
   4735                d->fxState[4].size   = sizeof(UInt);
   4736 
   4737                stmt( IRStmt_Dirty(d) );
   4738 
   4739                DIP("fninit\n");
   4740                break;
   4741             }
   4742 
   4743             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   4744                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   4745                break;
   4746 
   4747             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   4748                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   4749                break;
   4750 
   4751             default:
   4752                goto decode_fail;
   4753          }
   4754       }
   4755    }
   4756 
   4757    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   4758    else
   4759    if (first_opcode == 0xDC) {
   4760       if (modrm < 0xC0) {
   4761 
   4762          /* bits 5,4,3 are an opcode extension, and the modRM also
   4763             specifies an address. */
   4764          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4765          delta += len;
   4766 
   4767          switch (gregOfRM(modrm)) {
   4768 
   4769             case 0: /* FADD double-real */
   4770                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   4771                break;
   4772 
   4773             case 1: /* FMUL double-real */
   4774                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   4775                break;
   4776 
   4777             case 2: /* FCOM double-real */
   4778                DIP("fcoml %s\n", dis_buf);
   4779                /* This forces C1 to zero, which isn't right. */
   4780                put_C3210(
   4781                    binop( Iop_And32,
   4782                           binop(Iop_Shl32,
   4783                                 binop(Iop_CmpF64,
   4784                                       get_ST(0),
   4785                                       loadLE(Ity_F64,mkexpr(addr))),
   4786                                 mkU8(8)),
   4787                           mkU32(0x4500)
   4788                    ));
   4789                break;
   4790 
   4791             case 3: /* FCOMP double-real */
   4792                DIP("fcompl %s\n", dis_buf);
   4793                /* This forces C1 to zero, which isn't right. */
   4794                put_C3210(
   4795                    binop( Iop_And32,
   4796                           binop(Iop_Shl32,
   4797                                 binop(Iop_CmpF64,
   4798                                       get_ST(0),
   4799                                       loadLE(Ity_F64,mkexpr(addr))),
   4800                                 mkU8(8)),
   4801                           mkU32(0x4500)
   4802                    ));
   4803                fp_pop();
   4804                break;
   4805 
   4806             case 4: /* FSUB double-real */
   4807                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   4808                break;
   4809 
   4810             case 5: /* FSUBR double-real */
   4811                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   4812                break;
   4813 
   4814             case 6: /* FDIV double-real */
   4815                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   4816                break;
   4817 
   4818             case 7: /* FDIVR double-real */
   4819                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   4820                break;
   4821 
   4822             default:
   4823                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   4824                vex_printf("first_opcode == 0xDC\n");
   4825                goto decode_fail;
   4826          }
   4827 
   4828       } else {
   4829 
   4830          delta++;
   4831          switch (modrm) {
   4832 
   4833             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   4834                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   4835                break;
   4836 
   4837             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   4838                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   4839                break;
   4840 
   4841             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   4842                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   4843                break;
   4844 
   4845             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   4846                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   4847                break;
   4848 
   4849             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   4850                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   4851                break;
   4852 
   4853             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   4854                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   4855                break;
   4856 
   4857             default:
   4858                goto decode_fail;
   4859          }
   4860 
   4861       }
   4862    }
   4863 
   4864    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   4865    else
   4866    if (first_opcode == 0xDD) {
   4867 
   4868       if (modrm < 0xC0) {
   4869 
   4870          /* bits 5,4,3 are an opcode extension, and the modRM also
   4871             specifies an address. */
   4872          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   4873          delta += len;
   4874 
   4875          switch (gregOfRM(modrm)) {
   4876 
   4877             case 0: /* FLD double-real */
   4878                DIP("fldl %s\n", dis_buf);
   4879                fp_push();
   4880                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   4881                break;
   4882 
   4883             case 1: /* FISTTPQ m64 (SSE3) */
   4884                DIP("fistppll %s\n", dis_buf);
   4885                storeLE( mkexpr(addr),
   4886                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   4887                fp_pop();
   4888                break;
   4889 
   4890             case 2: /* FST double-real */
   4891                DIP("fstl %s\n", dis_buf);
   4892                storeLE(mkexpr(addr), get_ST(0));
   4893                break;
   4894 
   4895             case 3: /* FSTP double-real */
   4896                DIP("fstpl %s\n", dis_buf);
   4897                storeLE(mkexpr(addr), get_ST(0));
   4898                fp_pop();
   4899                break;
   4900 
   4901             case 4: { /* FRSTOR m108 */
   4902                /* Uses dirty helper:
   4903                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
   4904                IRTemp   ew = newTemp(Ity_I32);
   4905                IRDirty* d  = unsafeIRDirty_0_N (
   4906                                 0/*regparms*/,
   4907                                 "x86g_dirtyhelper_FRSTOR",
   4908                                 &x86g_dirtyhelper_FRSTOR,
   4909                                 mkIRExprVec_1( mkexpr(addr) )
   4910                              );
   4911                d->needsBBP = True;
   4912                d->tmp      = ew;
   4913                /* declare we're reading memory */
   4914                d->mFx   = Ifx_Read;
   4915                d->mAddr = mkexpr(addr);
   4916                d->mSize = 108;
   4917 
   4918                /* declare we're writing guest state */
   4919                d->nFxState = 5;
   4920 
   4921                d->fxState[0].fx     = Ifx_Write;
   4922                d->fxState[0].offset = OFFB_FTOP;
   4923                d->fxState[0].size   = sizeof(UInt);
   4924 
   4925                d->fxState[1].fx     = Ifx_Write;
   4926                d->fxState[1].offset = OFFB_FPREGS;
   4927                d->fxState[1].size   = 8 * sizeof(ULong);
   4928 
   4929                d->fxState[2].fx     = Ifx_Write;
   4930                d->fxState[2].offset = OFFB_FPTAGS;
   4931                d->fxState[2].size   = 8 * sizeof(UChar);
   4932 
   4933                d->fxState[3].fx     = Ifx_Write;
   4934                d->fxState[3].offset = OFFB_FPROUND;
   4935                d->fxState[3].size   = sizeof(UInt);
   4936 
   4937                d->fxState[4].fx     = Ifx_Write;
   4938                d->fxState[4].offset = OFFB_FC3210;
   4939                d->fxState[4].size   = sizeof(UInt);
   4940 
   4941                stmt( IRStmt_Dirty(d) );
   4942 
   4943                /* ew contains any emulation warning we may need to
   4944                   issue.  If needed, side-exit to the next insn,
   4945                   reporting the warning, so that Valgrind's dispatcher
   4946                   sees the warning. */
   4947                put_emwarn( mkexpr(ew) );
   4948                stmt(
   4949                   IRStmt_Exit(
   4950                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   4951                      Ijk_EmWarn,
   4952                      IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   4953                   )
   4954                );
   4955 
   4956                DIP("frstor %s\n", dis_buf);
   4957                break;
   4958             }
   4959 
   4960             case 6: { /* FNSAVE m108 */
   4961                /* Uses dirty helper:
   4962                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
   4963                IRDirty* d = unsafeIRDirty_0_N (
   4964                                0/*regparms*/,
   4965                                "x86g_dirtyhelper_FSAVE",
   4966                                &x86g_dirtyhelper_FSAVE,
   4967                                mkIRExprVec_1( mkexpr(addr) )
   4968                             );
   4969                d->needsBBP = True;
   4970                /* declare we're writing memory */
   4971                d->mFx   = Ifx_Write;
   4972                d->mAddr = mkexpr(addr);
   4973                d->mSize = 108;
   4974 
   4975                /* declare we're reading guest state */
   4976                d->nFxState = 5;
   4977 
   4978                d->fxState[0].fx     = Ifx_Read;
   4979                d->fxState[0].offset = OFFB_FTOP;
   4980                d->fxState[0].size   = sizeof(UInt);
   4981 
   4982                d->fxState[1].fx     = Ifx_Read;
   4983                d->fxState[1].offset = OFFB_FPREGS;
   4984                d->fxState[1].size   = 8 * sizeof(ULong);
   4985 
   4986                d->fxState[2].fx     = Ifx_Read;
   4987                d->fxState[2].offset = OFFB_FPTAGS;
   4988                d->fxState[2].size   = 8 * sizeof(UChar);
   4989 
   4990                d->fxState[3].fx     = Ifx_Read;
   4991                d->fxState[3].offset = OFFB_FPROUND;
   4992                d->fxState[3].size   = sizeof(UInt);
   4993 
   4994                d->fxState[4].fx     = Ifx_Read;
   4995                d->fxState[4].offset = OFFB_FC3210;
   4996                d->fxState[4].size   = sizeof(UInt);
   4997 
   4998                stmt( IRStmt_Dirty(d) );
   4999 
   5000                DIP("fnsave %s\n", dis_buf);
   5001                break;
   5002             }
   5003 
   5004             case 7: { /* FNSTSW m16 */
   5005                IRExpr* sw = get_FPU_sw();
   5006                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   5007                storeLE( mkexpr(addr), sw );
   5008                DIP("fnstsw %s\n", dis_buf);
   5009                break;
   5010             }
   5011 
   5012             default:
   5013                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5014                vex_printf("first_opcode == 0xDD\n");
   5015                goto decode_fail;
   5016          }
   5017       } else {
   5018          delta++;
   5019          switch (modrm) {
   5020 
   5021             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   5022                r_dst = (UInt)modrm - 0xC0;
   5023                DIP("ffree %%st(%d)\n", (Int)r_dst);
   5024                put_ST_TAG ( r_dst, mkU8(0) );
   5025                break;
   5026 
   5027             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   5028                r_dst = (UInt)modrm - 0xD0;
   5029                DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
   5030                /* P4 manual says: "If the destination operand is a
   5031                   non-empty register, the invalid-operation exception
   5032                   is not generated.  Hence put_ST_UNCHECKED. */
   5033                put_ST_UNCHECKED(r_dst, get_ST(0));
   5034                break;
   5035 
   5036             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   5037                r_dst = (UInt)modrm - 0xD8;
   5038                DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
   5039                /* P4 manual says: "If the destination operand is a
   5040                   non-empty register, the invalid-operation exception
   5041                   is not generated.  Hence put_ST_UNCHECKED. */
   5042                put_ST_UNCHECKED(r_dst, get_ST(0));
   5043                fp_pop();
   5044                break;
   5045 
   5046             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   5047                r_dst = (UInt)modrm - 0xE0;
   5048                DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
   5049                /* This forces C1 to zero, which isn't right. */
   5050                put_C3210(
   5051                    binop( Iop_And32,
   5052                           binop(Iop_Shl32,
   5053                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5054                                 mkU8(8)),
   5055                           mkU32(0x4500)
   5056                    ));
   5057                break;
   5058 
   5059             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   5060                r_dst = (UInt)modrm - 0xE8;
   5061                DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
   5062                /* This forces C1 to zero, which isn't right. */
   5063                put_C3210(
   5064                    binop( Iop_And32,
   5065                           binop(Iop_Shl32,
   5066                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5067                                 mkU8(8)),
   5068                           mkU32(0x4500)
   5069                    ));
   5070                fp_pop();
   5071                break;
   5072 
   5073             default:
   5074                goto decode_fail;
   5075          }
   5076       }
   5077    }
   5078 
   5079    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   5080    else
   5081    if (first_opcode == 0xDE) {
   5082 
   5083       if (modrm < 0xC0) {
   5084 
   5085          /* bits 5,4,3 are an opcode extension, and the modRM also
   5086             specifies an address. */
   5087          IROp   fop;
   5088          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5089          delta += len;
   5090 
   5091          switch (gregOfRM(modrm)) {
   5092 
   5093             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   5094                DIP("fiaddw %s\n", dis_buf);
   5095                fop = Iop_AddF64;
   5096                goto do_fop_m16;
   5097 
   5098             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   5099                DIP("fimulw %s\n", dis_buf);
   5100                fop = Iop_MulF64;
   5101                goto do_fop_m16;
   5102 
   5103             case 2: /* FICOM m16int */
   5104                DIP("ficomw %s\n", dis_buf);
   5105                /* This forces C1 to zero, which isn't right. */
   5106                put_C3210(
   5107                    binop( Iop_And32,
   5108                           binop(Iop_Shl32,
   5109                                 binop(Iop_CmpF64,
   5110                                       get_ST(0),
   5111                                       unop(Iop_I32StoF64,
   5112                                          unop(Iop_16Sto32,
   5113                                            loadLE(Ity_I16,mkexpr(addr))))),
   5114                                 mkU8(8)),
   5115                           mkU32(0x4500)
   5116                    ));
   5117                break;
   5118 
   5119             case 3: /* FICOMP m16int */
   5120                DIP("ficompw %s\n", dis_buf);
   5121                /* This forces C1 to zero, which isn't right. */
   5122                put_C3210(
   5123                    binop( Iop_And32,
   5124                           binop(Iop_Shl32,
   5125                                 binop(Iop_CmpF64,
   5126                                       get_ST(0),
   5127                                       unop(Iop_I32StoF64,
   5128                                          unop(Iop_16Sto32,
   5129                                               loadLE(Ity_I16,mkexpr(addr))))),
   5130                                 mkU8(8)),
   5131                           mkU32(0x4500)
   5132                    ));
   5133                fp_pop();
   5134                break;
   5135 
   5136             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   5137                DIP("fisubw %s\n", dis_buf);
   5138                fop = Iop_SubF64;
   5139                goto do_fop_m16;
   5140 
   5141             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   5142                DIP("fisubrw %s\n", dis_buf);
   5143                fop = Iop_SubF64;
   5144                goto do_foprev_m16;
   5145 
   5146             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   5147                DIP("fisubw %s\n", dis_buf);
   5148                fop = Iop_DivF64;
   5149                goto do_fop_m16;
   5150 
   5151             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   5152                DIP("fidivrw %s\n", dis_buf);
   5153                fop = Iop_DivF64;
   5154                goto do_foprev_m16;
   5155 
   5156             do_fop_m16:
   5157                put_ST_UNCHECKED(0,
   5158                   triop(fop,
   5159                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5160                         get_ST(0),
   5161                         unop(Iop_I32StoF64,
   5162                              unop(Iop_16Sto32,
   5163                                   loadLE(Ity_I16, mkexpr(addr))))));
   5164                break;
   5165 
   5166             do_foprev_m16:
   5167                put_ST_UNCHECKED(0,
   5168                   triop(fop,
   5169                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5170                         unop(Iop_I32StoF64,
   5171                              unop(Iop_16Sto32,
   5172                                   loadLE(Ity_I16, mkexpr(addr)))),
   5173                         get_ST(0)));
   5174                break;
   5175 
   5176             default:
   5177                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5178                vex_printf("first_opcode == 0xDE\n");
   5179                goto decode_fail;
   5180          }
   5181 
   5182       } else {
   5183 
   5184          delta++;
   5185          switch (modrm) {
   5186 
   5187             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   5188                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   5189                break;
   5190 
   5191             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   5192                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   5193                break;
   5194 
   5195             case 0xD9: /* FCOMPP %st(0),%st(1) */
   5196                DIP("fuompp %%st(0),%%st(1)\n");
   5197                /* This forces C1 to zero, which isn't right. */
   5198                put_C3210(
   5199                    binop( Iop_And32,
   5200                           binop(Iop_Shl32,
   5201                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5202                                 mkU8(8)),
   5203                           mkU32(0x4500)
   5204                    ));
   5205                fp_pop();
   5206                fp_pop();
   5207                break;
   5208 
   5209             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   5210                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   5211                break;
   5212 
   5213             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   5214                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   5215                break;
   5216 
   5217             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   5218                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   5219                break;
   5220 
   5221             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   5222                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   5223                break;
   5224 
   5225             default:
   5226                goto decode_fail;
   5227          }
   5228 
   5229       }
   5230    }
   5231 
   5232    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   5233    else
   5234    if (first_opcode == 0xDF) {
   5235 
   5236       if (modrm < 0xC0) {
   5237 
   5238          /* bits 5,4,3 are an opcode extension, and the modRM also
   5239             specifies an address. */
   5240          IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5241          delta += len;
   5242 
   5243          switch (gregOfRM(modrm)) {
   5244 
   5245             case 0: /* FILD m16int */
   5246                DIP("fildw %s\n", dis_buf);
   5247                fp_push();
   5248                put_ST(0, unop(Iop_I32StoF64,
   5249                               unop(Iop_16Sto32,
   5250                                    loadLE(Ity_I16, mkexpr(addr)))));
   5251                break;
   5252 
   5253             case 1: /* FISTTPS m16 (SSE3) */
   5254                DIP("fisttps %s\n", dis_buf);
   5255                storeLE( mkexpr(addr),
   5256                         binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
   5257                fp_pop();
   5258                break;
   5259 
   5260             case 2: /* FIST m16 */
   5261                DIP("fistp %s\n", dis_buf);
   5262                storeLE( mkexpr(addr),
   5263                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5264                break;
   5265 
   5266             case 3: /* FISTP m16 */
   5267                DIP("fistps %s\n", dis_buf);
   5268                storeLE( mkexpr(addr),
   5269                         binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
   5270                fp_pop();
   5271                break;
   5272 
   5273             case 5: /* FILD m64 */
   5274                DIP("fildll %s\n", dis_buf);
   5275                fp_push();
   5276                put_ST(0, binop(Iop_I64StoF64,
   5277                                get_roundingmode(),
   5278                                loadLE(Ity_I64, mkexpr(addr))));
   5279                break;
   5280 
   5281             case 7: /* FISTP m64 */
   5282                DIP("fistpll %s\n", dis_buf);
   5283                storeLE( mkexpr(addr),
   5284                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   5285                fp_pop();
   5286                break;
   5287 
   5288             default:
   5289                vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
   5290                vex_printf("first_opcode == 0xDF\n");
   5291                goto decode_fail;
   5292          }
   5293 
   5294       } else {
   5295 
   5296          delta++;
   5297          switch (modrm) {
   5298 
   5299             case 0xC0: /* FFREEP %st(0) */
   5300                DIP("ffreep %%st(%d)\n", 0);
   5301                put_ST_TAG ( 0, mkU8(0) );
   5302                fp_pop();
   5303                break;
   5304 
   5305             case 0xE0: /* FNSTSW %ax */
   5306                DIP("fnstsw %%ax\n");
   5307                /* Get the FPU status word value and dump it in %AX. */
   5308                if (0) {
   5309                   /* The obvious thing to do is simply dump the 16-bit
   5310                      status word value in %AX.  However, due to a
   5311                      limitation in Memcheck's origin tracking
   5312                      machinery, this causes Memcheck not to track the
   5313                      origin of any undefinedness into %AH (only into
   5314                      %AL/%AX/%EAX), which means origins are lost in
   5315                      the sequence "fnstsw %ax; test $M,%ah; jcond .." */
   5316                   putIReg(2, R_EAX, get_FPU_sw());
   5317                } else {
   5318                   /* So a somewhat lame kludge is to make it very
   5319                      clear to Memcheck that the value is written to
   5320                      both %AH and %AL.  This generates marginally
   5321                      worse code, but I don't think it matters much. */
   5322                   IRTemp t16 = newTemp(Ity_I16);
   5323                   assign(t16, get_FPU_sw());
   5324                   putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
   5325                   putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
   5326                }
   5327                break;
   5328 
   5329             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   5330                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   5331                break;
   5332 
   5333             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   5334                /* not really right since COMIP != UCOMIP */
   5335                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   5336                break;
   5337 
   5338             default:
   5339                goto decode_fail;
   5340          }
   5341       }
   5342 
   5343    }
   5344 
   5345    else
   5346    vpanic("dis_FPU(x86): invalid primary opcode");
   5347 
   5348    *decode_ok = True;
   5349    return delta;
   5350 
   5351   decode_fail:
   5352    *decode_ok = False;
   5353    return delta;
   5354 }
   5355 
   5356 
   5357 /*------------------------------------------------------------*/
   5358 /*---                                                      ---*/
   5359 /*--- MMX INSTRUCTIONS                                     ---*/
   5360 /*---                                                      ---*/
   5361 /*------------------------------------------------------------*/
   5362 
   5363 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   5364    IA32 arch manual, volume 3):
   5365 
   5366    Read from, or write to MMX register (viz, any insn except EMMS):
   5367    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   5368    * FP stack pointer set to zero
   5369 
   5370    EMMS:
   5371    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   5372    * FP stack pointer set to zero
   5373 */
   5374 
   5375 static void do_MMX_preamble ( void )
   5376 {
   5377    Int         i;
   5378    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5379    IRExpr*     zero  = mkU32(0);
   5380    IRExpr*     tag1  = mkU8(1);
   5381    put_ftop(zero);
   5382    for (i = 0; i < 8; i++)
   5383       stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
   5384 }
   5385 
   5386 static void do_EMMS_preamble ( void )
   5387 {
   5388    Int         i;
   5389    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5390    IRExpr*     zero  = mkU32(0);
   5391    IRExpr*     tag0  = mkU8(0);
   5392    put_ftop(zero);
   5393    for (i = 0; i < 8; i++)
   5394       stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
   5395 }
   5396 
   5397 
   5398 static IRExpr* getMMXReg ( UInt archreg )
   5399 {
   5400    vassert(archreg < 8);
   5401    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   5402 }
   5403 
   5404 
   5405 static void putMMXReg ( UInt archreg, IRExpr* e )
   5406 {
   5407    vassert(archreg < 8);
   5408    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   5409    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   5410 }
   5411 
   5412 
   5413 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   5414    sense that it does not first call do_MMX_preamble() -- that is the
   5415    responsibility of its caller. */
   5416 
   5417 static
   5418 UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
   5419                                Int    delta,
   5420                                UChar  opc,
   5421                                HChar* name,
   5422                                Bool   show_granularity )
   5423 {
   5424    HChar   dis_buf[50];
   5425    UChar   modrm = getIByte(delta);
   5426    Bool    isReg = epartIsReg(modrm);
   5427    IRExpr* argL  = NULL;
   5428    IRExpr* argR  = NULL;
   5429    IRExpr* argG  = NULL;
   5430    IRExpr* argE  = NULL;
   5431    IRTemp  res   = newTemp(Ity_I64);
   5432 
   5433    Bool    invG  = False;
   5434    IROp    op    = Iop_INVALID;
   5435    void*   hAddr = NULL;
   5436    HChar*  hName = NULL;
   5437    Bool    eLeft = False;
   5438 
   5439 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   5440 
   5441    switch (opc) {
   5442       /* Original MMX ones */
   5443       case 0xFC: op = Iop_Add8x8; break;
   5444       case 0xFD: op = Iop_Add16x4; break;
   5445       case 0xFE: op = Iop_Add32x2; break;
   5446 
   5447       case 0xEC: op = Iop_QAdd8Sx8; break;
   5448       case 0xED: op = Iop_QAdd16Sx4; break;
   5449 
   5450       case 0xDC: op = Iop_QAdd8Ux8; break;
   5451       case 0xDD: op = Iop_QAdd16Ux4; break;
   5452 
   5453       case 0xF8: op = Iop_Sub8x8;  break;
   5454       case 0xF9: op = Iop_Sub16x4; break;
   5455       case 0xFA: op = Iop_Sub32x2; break;
   5456 
   5457       case 0xE8: op = Iop_QSub8Sx8; break;
   5458       case 0xE9: op = Iop_QSub16Sx4; break;
   5459 
   5460       case 0xD8: op = Iop_QSub8Ux8; break;
   5461       case 0xD9: op = Iop_QSub16Ux4; break;
   5462 
   5463       case 0xE5: op = Iop_MulHi16Sx4; break;
   5464       case 0xD5: op = Iop_Mul16x4; break;
   5465       case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
   5466 
   5467       case 0x74: op = Iop_CmpEQ8x8; break;
   5468       case 0x75: op = Iop_CmpEQ16x4; break;
   5469       case 0x76: op = Iop_CmpEQ32x2; break;
   5470 
   5471       case 0x64: op = Iop_CmpGT8Sx8; break;
   5472       case 0x65: op = Iop_CmpGT16Sx4; break;
   5473       case 0x66: op = Iop_CmpGT32Sx2; break;
   5474 
   5475       case 0x6B: op = Iop_QNarrow32Sx2; eLeft = True; break;
   5476       case 0x63: op = Iop_QNarrow16Sx4; eLeft = True; break;
   5477       case 0x67: op = Iop_QNarrow16Ux4; eLeft = True; break;
   5478 
   5479       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   5480       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   5481       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   5482 
   5483       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   5484       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   5485       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   5486 
   5487       case 0xDB: op = Iop_And64; break;
   5488       case 0xDF: op = Iop_And64; invG = True; break;
   5489       case 0xEB: op = Iop_Or64; break;
   5490       case 0xEF: /* Possibly do better here if argL and argR are the
   5491                     same reg */
   5492                  op = Iop_Xor64; break;
   5493 
   5494       /* Introduced in SSE1 */
   5495       case 0xE0: op = Iop_Avg8Ux8;    break;
   5496       case 0xE3: op = Iop_Avg16Ux4;   break;
   5497       case 0xEE: op = Iop_Max16Sx4;   break;
   5498       case 0xDE: op = Iop_Max8Ux8;    break;
   5499       case 0xEA: op = Iop_Min16Sx4;   break;
   5500       case 0xDA: op = Iop_Min8Ux8;    break;
   5501       case 0xE4: op = Iop_MulHi16Ux4; break;
   5502       case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
   5503 
   5504       /* Introduced in SSE2 */
   5505       case 0xD4: op = Iop_Add64; break;
   5506       case 0xFB: op = Iop_Sub64; break;
   5507 
   5508       default:
   5509          vex_printf("\n0x%x\n", (Int)opc);
   5510          vpanic("dis_MMXop_regmem_to_reg");
   5511    }
   5512 
   5513 #  undef XXX
   5514 
   5515    argG = getMMXReg(gregOfRM(modrm));
   5516    if (invG)
   5517       argG = unop(Iop_Not64, argG);
   5518 
   5519    if (isReg) {
   5520       delta++;
   5521       argE = getMMXReg(eregOfRM(modrm));
   5522    } else {
   5523       Int    len;
   5524       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5525       delta += len;
   5526       argE = loadLE(Ity_I64, mkexpr(addr));
   5527    }
   5528 
   5529    if (eLeft) {
   5530       argL = argE;
   5531       argR = argG;
   5532    } else {
   5533       argL = argG;
   5534       argR = argE;
   5535    }
   5536 
   5537    if (op != Iop_INVALID) {
   5538       vassert(hName == NULL);
   5539       vassert(hAddr == NULL);
   5540       assign(res, binop(op, argL, argR));
   5541    } else {
   5542       vassert(hName != NULL);
   5543       vassert(hAddr != NULL);
   5544       assign( res,
   5545               mkIRExprCCall(
   5546                  Ity_I64,
   5547                  0/*regparms*/, hName, hAddr,
   5548                  mkIRExprVec_2( argL, argR )
   5549               )
   5550             );
   5551    }
   5552 
   5553    putMMXReg( gregOfRM(modrm), mkexpr(res) );
   5554 
   5555    DIP("%s%s %s, %s\n",
   5556        name, show_granularity ? nameMMXGran(opc & 3) : "",
   5557        ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
   5558        nameMMXReg(gregOfRM(modrm)) );
   5559 
   5560    return delta;
   5561 }
   5562 
   5563 
   5564 /* Vector by scalar shift of G by the amount specified at the bottom
   5565    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   5566 
   5567 static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
   5568                                  HChar* opname, IROp op )
   5569 {
   5570    HChar   dis_buf[50];
   5571    Int     alen, size;
   5572    IRTemp  addr;
   5573    Bool    shl, shr, sar;
   5574    UChar   rm   = getIByte(delta);
   5575    IRTemp  g0   = newTemp(Ity_I64);
   5576    IRTemp  g1   = newTemp(Ity_I64);
   5577    IRTemp  amt  = newTemp(Ity_I32);
   5578    IRTemp  amt8 = newTemp(Ity_I8);
   5579 
   5580    if (epartIsReg(rm)) {
   5581       assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
   5582       DIP("%s %s,%s\n", opname,
   5583                         nameMMXReg(eregOfRM(rm)),
   5584                         nameMMXReg(gregOfRM(rm)) );
   5585       delta++;
   5586    } else {
   5587       addr = disAMode ( &alen, sorb, delta, dis_buf );
   5588       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   5589       DIP("%s %s,%s\n", opname,
   5590                         dis_buf,
   5591                         nameMMXReg(gregOfRM(rm)) );
   5592       delta += alen;
   5593    }
   5594    assign( g0,   getMMXReg(gregOfRM(rm)) );
   5595    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   5596 
   5597    shl = shr = sar = False;
   5598    size = 0;
   5599    switch (op) {
   5600       case Iop_ShlN16x4: shl = True; size = 32; break;
   5601       case Iop_ShlN32x2: shl = True; size = 32; break;
   5602       case Iop_Shl64:    shl = True; size = 64; break;
   5603       case Iop_ShrN16x4: shr = True; size = 16; break;
   5604       case Iop_ShrN32x2: shr = True; size = 32; break;
   5605       case Iop_Shr64:    shr = True; size = 64; break;
   5606       case Iop_SarN16x4: sar = True; size = 16; break;
   5607       case Iop_SarN32x2: sar = True; size = 32; break;
   5608       default: vassert(0);
   5609    }
   5610 
   5611    if (shl || shr) {
   5612      assign(
   5613         g1,
   5614         IRExpr_Mux0X(
   5615            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5616            mkU64(0),
   5617            binop(op, mkexpr(g0), mkexpr(amt8))
   5618         )
   5619      );
   5620    } else
   5621    if (sar) {
   5622      assign(
   5623         g1,
   5624         IRExpr_Mux0X(
   5625            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   5626            binop(op, mkexpr(g0), mkU8(size-1)),
   5627            binop(op, mkexpr(g0), mkexpr(amt8))
   5628         )
   5629      );
   5630    } else {
   5631       /*NOTREACHED*/
   5632       vassert(0);
   5633    }
   5634 
   5635    putMMXReg( gregOfRM(rm), mkexpr(g1) );
   5636    return delta;
   5637 }
   5638 
   5639 
   5640 /* Vector by scalar shift of E by an immediate byte.  This is a
   5641    straight copy of dis_SSE_shiftE_imm. */
   5642 
   5643 static
   5644 UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
   5645 {
   5646    Bool    shl, shr, sar;
   5647    UChar   rm   = getIByte(delta);
   5648    IRTemp  e0   = newTemp(Ity_I64);
   5649    IRTemp  e1   = newTemp(Ity_I64);
   5650    UChar   amt, size;
   5651    vassert(epartIsReg(rm));
   5652    vassert(gregOfRM(rm) == 2
   5653            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   5654    amt = getIByte(delta+1);
   5655    delta += 2;
   5656    DIP("%s $%d,%s\n", opname,
   5657                       (Int)amt,
   5658                       nameMMXReg(eregOfRM(rm)) );
   5659 
   5660    assign( e0, getMMXReg(eregOfRM(rm)) );
   5661 
   5662    shl = shr = sar = False;
   5663    size = 0;
   5664    switch (op) {
   5665       case Iop_ShlN16x4: shl = True; size = 16; break;
   5666       case Iop_ShlN32x2: shl = True; size = 32; break;
   5667       case Iop_Shl64:    shl = True; size = 64; break;
   5668       case Iop_SarN16x4: sar = True; size = 16; break;
   5669       case Iop_SarN32x2: sar = True; size = 32; break;
   5670       case Iop_ShrN16x4: shr = True; size = 16; break;
   5671       case Iop_ShrN32x2: shr = True; size = 32; break;
   5672       case Iop_Shr64:    shr = True; size = 64; break;
   5673       default: vassert(0);
   5674    }
   5675 
   5676    if (shl || shr) {
   5677       assign( e1, amt >= size
   5678                      ? mkU64(0)
   5679                      : binop(op, mkexpr(e0), mkU8(amt))
   5680       );
   5681    } else
   5682    if (sar) {
   5683       assign( e1, amt >= size
   5684                      ? binop(op, mkexpr(e0), mkU8(size-1))
   5685                      : binop(op, mkexpr(e0), mkU8(amt))
   5686       );
   5687    } else {
   5688       /*NOTREACHED*/
   5689       vassert(0);
   5690    }
   5691 
   5692    putMMXReg( eregOfRM(rm), mkexpr(e1) );
   5693    return delta;
   5694 }
   5695 
   5696 
   5697 /* Completely handle all MMX instructions except emms. */
   5698 
   5699 static
   5700 UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
   5701 {
   5702    Int   len;
   5703    UChar modrm;
   5704    HChar dis_buf[50];
   5705    UChar opc = getIByte(delta);
   5706    delta++;
   5707 
   5708    /* dis_MMX handles all insns except emms. */
   5709    do_MMX_preamble();
   5710 
   5711    switch (opc) {
   5712 
   5713       case 0x6E:
   5714          /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
   5715          if (sz != 4)
   5716             goto mmx_decode_failure;
   5717          modrm = getIByte(delta);
   5718          if (epartIsReg(modrm)) {
   5719             delta++;
   5720             putMMXReg(
   5721                gregOfRM(modrm),
   5722                binop( Iop_32HLto64,
   5723                       mkU32(0),
   5724                       getIReg(4, eregOfRM(modrm)) ) );
   5725             DIP("movd %s, %s\n",
   5726                 nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5727          } else {
   5728             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5729             delta += len;
   5730             putMMXReg(
   5731                gregOfRM(modrm),
   5732                binop( Iop_32HLto64,
   5733                       mkU32(0),
   5734                       loadLE(Ity_I32, mkexpr(addr)) ) );
   5735             DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
   5736          }
   5737          break;
   5738 
   5739       case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
   5740          if (sz != 4)
   5741             goto mmx_decode_failure;
   5742          modrm = getIByte(delta);
   5743          if (epartIsReg(modrm)) {
   5744             delta++;
   5745             putIReg( 4, eregOfRM(modrm),
   5746                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5747             DIP("movd %s, %s\n",
   5748                 nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   5749          } else {
   5750             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5751             delta += len;
   5752             storeLE( mkexpr(addr),
   5753                      unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
   5754             DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
   5755          }
   5756          break;
   5757 
   5758       case 0x6F:
   5759          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   5760          if (sz != 4)
   5761             goto mmx_decode_failure;
   5762          modrm = getIByte(delta);
   5763          if (epartIsReg(modrm)) {
   5764             delta++;
   5765             putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
   5766             DIP("movq %s, %s\n",
   5767                 nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
   5768          } else {
   5769             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5770             delta += len;
   5771             putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   5772             DIP("movq %s, %s\n",
   5773                 dis_buf, nameMMXReg(gregOfRM(modrm)));
   5774          }
   5775          break;
   5776 
   5777       case 0x7F:
   5778          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   5779          if (sz != 4)
   5780             goto mmx_decode_failure;
   5781          modrm = getIByte(delta);
   5782          if (epartIsReg(modrm)) {
   5783             delta++;
   5784             putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
   5785             DIP("movq %s, %s\n",
   5786                 nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
   5787          } else {
   5788             IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   5789             delta += len;
   5790             storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   5791             DIP("mov(nt)q %s, %s\n",
   5792                 nameMMXReg(gregOfRM(modrm)), dis_buf);
   5793          }
   5794          break;
   5795 
   5796       case 0xFC:
   5797       case 0xFD:
   5798       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   5799          if (sz != 4)
   5800             goto mmx_decode_failure;
   5801          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
   5802          break;
   5803 
   5804       case 0xEC:
   5805       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5806          if (sz != 4)
   5807             goto mmx_decode_failure;
   5808          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
   5809          break;
   5810 
   5811       case 0xDC:
   5812       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5813          if (sz != 4)
   5814             goto mmx_decode_failure;
   5815          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
   5816          break;
   5817 
   5818       case 0xF8:
   5819       case 0xF9:
   5820       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   5821          if (sz != 4)
   5822             goto mmx_decode_failure;
   5823          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
   5824          break;
   5825 
   5826       case 0xE8:
   5827       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5828          if (sz != 4)
   5829             goto mmx_decode_failure;
   5830          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
   5831          break;
   5832 
   5833       case 0xD8:
   5834       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   5835          if (sz != 4)
   5836             goto mmx_decode_failure;
   5837          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
   5838          break;
   5839 
   5840       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   5841          if (sz != 4)
   5842             goto mmx_decode_failure;
   5843          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
   5844          break;
   5845 
   5846       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   5847          if (sz != 4)
   5848             goto mmx_decode_failure;
   5849          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
   5850          break;
   5851 
   5852       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   5853          vassert(sz == 4);
   5854          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
   5855          break;
   5856 
   5857       case 0x74:
   5858       case 0x75:
   5859       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   5860          if (sz != 4)
   5861             goto mmx_decode_failure;
   5862          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
   5863          break;
   5864 
   5865       case 0x64:
   5866       case 0x65:
   5867       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   5868          if (sz != 4)
   5869             goto mmx_decode_failure;
   5870          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
   5871          break;
   5872 
   5873       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   5874          if (sz != 4)
   5875             goto mmx_decode_failure;
   5876          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
   5877          break;
   5878 
   5879       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5880          if (sz != 4)
   5881             goto mmx_decode_failure;
   5882          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
   5883          break;
   5884 
   5885       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   5886          if (sz != 4)
   5887             goto mmx_decode_failure;
   5888          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
   5889          break;
   5890 
   5891       case 0x68:
   5892       case 0x69:
   5893       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   5894          if (sz != 4)
   5895             goto mmx_decode_failure;
   5896          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
   5897          break;
   5898 
   5899       case 0x60:
   5900       case 0x61:
   5901       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5902          if (sz != 4)
   5903             goto mmx_decode_failure;
   5904          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
   5905          break;
   5906 
   5907       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   5908          if (sz != 4)
   5909             goto mmx_decode_failure;
   5910          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
   5911          break;
   5912 
   5913       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   5914          if (sz != 4)
   5915             goto mmx_decode_failure;
   5916          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
   5917          break;
   5918 
   5919       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   5920          if (sz != 4)
   5921             goto mmx_decode_failure;
   5922          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
   5923          break;
   5924 
   5925       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   5926          if (sz != 4)
   5927             goto mmx_decode_failure;
   5928          delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
   5929          break;
   5930 
   5931 #     define SHIFT_BY_REG(_name,_op)                                 \
   5932                 delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
   5933                 break;
   5934 
   5935       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5936       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   5937       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   5938       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   5939 
   5940       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   5941       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   5942       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   5943       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   5944 
   5945       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   5946       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   5947       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   5948 
   5949 #     undef SHIFT_BY_REG
   5950 
   5951       case 0x71:
   5952       case 0x72:
   5953       case 0x73: {
   5954          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   5955          UChar byte2, subopc;
   5956          if (sz != 4)
   5957             goto mmx_decode_failure;
   5958          byte2  = getIByte(delta);           /* amode / sub-opcode */
   5959          subopc = toUChar( (byte2 >> 3) & 7 );
   5960 
   5961 #        define SHIFT_BY_IMM(_name,_op)                         \
   5962              do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   5963              } while (0)
   5964 
   5965               if (subopc == 2 /*SRL*/ && opc == 0x71)
   5966                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   5967          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   5968                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   5969          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   5970                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   5971 
   5972          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   5973                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   5974          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   5975                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   5976 
   5977          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   5978                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   5979          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   5980                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   5981          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   5982                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   5983 
   5984          else goto mmx_decode_failure;
   5985 
   5986 #        undef SHIFT_BY_IMM
   5987          break;
   5988       }
   5989 
   5990       case 0xF7: {
   5991          IRTemp addr    = newTemp(Ity_I32);
   5992          IRTemp regD    = newTemp(Ity_I64);
   5993          IRTemp regM    = newTemp(Ity_I64);
   5994          IRTemp mask    = newTemp(Ity_I64);
   5995          IRTemp olddata = newTemp(Ity_I64);
   5996          IRTemp newdata = newTemp(Ity_I64);
   5997 
   5998          modrm = getIByte(delta);
   5999          if (sz != 4 || (!epartIsReg(modrm)))
   6000             goto mmx_decode_failure;
   6001          delta++;
   6002 
   6003          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   6004          assign( regM, getMMXReg( eregOfRM(modrm) ));
   6005          assign( regD, getMMXReg( gregOfRM(modrm) ));
   6006          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   6007          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   6008          assign( newdata,
   6009                  binop(Iop_Or64,
   6010                        binop(Iop_And64,
   6011                              mkexpr(regD),
   6012                              mkexpr(mask) ),
   6013                        binop(Iop_And64,
   6014                              mkexpr(olddata),
   6015                              unop(Iop_Not64, mkexpr(mask)))) );
   6016          storeLE( mkexpr(addr), mkexpr(newdata) );
   6017          DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
   6018                                  nameMMXReg( gregOfRM(modrm) ) );
   6019          break;
   6020       }
   6021 
   6022       /* --- MMX decode failure --- */
   6023       default:
   6024       mmx_decode_failure:
   6025          *decode_ok = False;
   6026          return delta; /* ignored */
   6027 
   6028    }
   6029 
   6030    *decode_ok = True;
   6031    return delta;
   6032 }
   6033 
   6034 
   6035 /*------------------------------------------------------------*/
   6036 /*--- More misc arithmetic and other obscure insns.        ---*/
   6037 /*------------------------------------------------------------*/
   6038 
   6039 /* Double length left and right shifts.  Apparently only required in
   6040    v-size (no b- variant). */
   6041 static
   6042 UInt dis_SHLRD_Gv_Ev ( UChar sorb,
   6043                        Int delta, UChar modrm,
   6044                        Int sz,
   6045                        IRExpr* shift_amt,
   6046                        Bool amt_is_literal,
   6047                        HChar* shift_amt_txt,
   6048                        Bool left_shift )
   6049 {
   6050    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   6051       for printing it.   And eip on entry points at the modrm byte. */
   6052    Int len;
   6053    HChar dis_buf[50];
   6054 
   6055    IRType ty       = szToITy(sz);
   6056    IRTemp gsrc     = newTemp(ty);
   6057    IRTemp esrc     = newTemp(ty);
   6058    IRTemp addr     = IRTemp_INVALID;
   6059    IRTemp tmpSH    = newTemp(Ity_I8);
   6060    IRTemp tmpL     = IRTemp_INVALID;
   6061    IRTemp tmpRes   = IRTemp_INVALID;
   6062    IRTemp tmpSubSh = IRTemp_INVALID;
   6063    IROp   mkpair;
   6064    IROp   getres;
   6065    IROp   shift;
   6066    IRExpr* mask = NULL;
   6067 
   6068    vassert(sz == 2 || sz == 4);
   6069 
   6070    /* The E-part is the destination; this is shifted.  The G-part
   6071       supplies bits to be shifted into the E-part, but is not
   6072       changed.
   6073 
   6074       If shifting left, form a double-length word with E at the top
   6075       and G at the bottom, and shift this left.  The result is then in
   6076       the high part.
   6077 
   6078       If shifting right, form a double-length word with G at the top
   6079       and E at the bottom, and shift this right.  The result is then
   6080       at the bottom.  */
   6081 
   6082    /* Fetch the operands. */
   6083 
   6084    assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
   6085 
   6086    if (epartIsReg(modrm)) {
   6087       delta++;
   6088       assign( esrc, getIReg(sz, eregOfRM(modrm)) );
   6089       DIP("sh%cd%c %s, %s, %s\n",
   6090           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6091           shift_amt_txt,
   6092           nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
   6093    } else {
   6094       addr = disAMode ( &len, sorb, delta, dis_buf );
   6095       delta += len;
   6096       assign( esrc, loadLE(ty, mkexpr(addr)) );
   6097       DIP("sh%cd%c %s, %s, %s\n",
   6098           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   6099           shift_amt_txt,
   6100           nameIReg(sz, gregOfRM(modrm)), dis_buf);
   6101    }
   6102 
   6103    /* Round up the relevant primops. */
   6104 
   6105    if (sz == 4) {
   6106       tmpL     = newTemp(Ity_I64);
   6107       tmpRes   = newTemp(Ity_I32);
   6108       tmpSubSh = newTemp(Ity_I32);
   6109       mkpair   = Iop_32HLto64;
   6110       getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
   6111       shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
   6112       mask     = mkU8(31);
   6113    } else {
   6114       /* sz == 2 */
   6115       tmpL     = newTemp(Ity_I32);
   6116       tmpRes   = newTemp(Ity_I16);
   6117       tmpSubSh = newTemp(Ity_I16);
   6118       mkpair   = Iop_16HLto32;
   6119       getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
   6120       shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
   6121       mask     = mkU8(15);
   6122    }
   6123 
   6124    /* Do the shift, calculate the subshift value, and set
   6125       the flag thunk. */
   6126 
   6127    assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
   6128 
   6129    if (left_shift)
   6130       assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
   6131    else
   6132       assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
   6133 
   6134    assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
   6135    assign( tmpSubSh,
   6136            unop(getres,
   6137                 binop(shift,
   6138                       mkexpr(tmpL),
   6139                       binop(Iop_And8,
   6140                             binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   6141                             mask))) );
   6142 
   6143    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
   6144                               tmpRes, tmpSubSh, ty, tmpSH );
   6145 
   6146    /* Put result back. */
   6147 
   6148    if (epartIsReg(modrm)) {
   6149       putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
   6150    } else {
   6151       storeLE( mkexpr(addr), mkexpr(tmpRes) );
   6152    }
   6153 
   6154    if (amt_is_literal) delta++;
   6155    return delta;
   6156 }
   6157 
   6158 
   6159 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   6160    required. */
   6161 
   6162 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   6163 
   6164 static HChar* nameBtOp ( BtOp op )
   6165 {
   6166    switch (op) {
   6167       case BtOpNone:  return "";
   6168       case BtOpSet:   return "s";
   6169       case BtOpReset: return "r";
   6170       case BtOpComp:  return "c";
   6171       default: vpanic("nameBtOp(x86)");
   6172    }
   6173 }
   6174 
   6175 
   6176 static
   6177 UInt dis_bt_G_E ( VexAbiInfo* vbi,
   6178                   UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
   6179 {
   6180    HChar  dis_buf[50];
   6181    UChar  modrm;
   6182    Int    len;
   6183    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   6184           t_addr1, t_esp, t_mask, t_new;
   6185 
   6186    vassert(sz == 2 || sz == 4);
   6187 
   6188    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   6189              = t_addr0 = t_addr1 = t_esp
   6190              = t_mask = t_new = IRTemp_INVALID;
   6191 
   6192    t_fetched = newTemp(Ity_I8);
   6193    t_new     = newTemp(Ity_I8);
   6194    t_bitno0  = newTemp(Ity_I32);
   6195    t_bitno1  = newTemp(Ity_I32);
   6196    t_bitno2  = newTemp(Ity_I8);
   6197    t_addr1   = newTemp(Ity_I32);
   6198    modrm     = getIByte(delta);
   6199 
   6200    assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
   6201 
   6202    if (epartIsReg(modrm)) {
   6203       delta++;
   6204       /* Get it onto the client's stack. */
   6205       t_esp = newTemp(Ity_I32);
   6206       t_addr0 = newTemp(Ity_I32);
   6207 
   6208       /* For the choice of the value 128, see comment in dis_bt_G_E in
   6209          guest_amd64_toIR.c.  We point out here only that 128 is
   6210          fast-cased in Memcheck and is > 0, so seems like a good
   6211          choice. */
   6212       vassert(vbi->guest_stack_redzone_size == 0);
   6213       assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
   6214       putIReg(4, R_ESP, mkexpr(t_esp));
   6215 
   6216       storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
   6217 
   6218       /* Make t_addr0 point at it. */
   6219       assign( t_addr0, mkexpr(t_esp) );
   6220 
   6221       /* Mask out upper bits of the shift amount, since we're doing a
   6222          reg. */
   6223       assign( t_bitno1, binop(Iop_And32,
   6224                               mkexpr(t_bitno0),
   6225                               mkU32(sz == 4 ? 31 : 15)) );
   6226 
   6227    } else {
   6228       t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
   6229       delta += len;
   6230       assign( t_bitno1, mkexpr(t_bitno0) );
   6231    }
   6232 
   6233    /* At this point: t_addr0 is the address being operated on.  If it
   6234       was a reg, we will have pushed it onto the client's stack.
   6235       t_bitno1 is the bit number, suitably masked in the case of a
   6236       reg.  */
   6237 
   6238    /* Now the main sequence. */
   6239    assign( t_addr1,
   6240            binop(Iop_Add32,
   6241                  mkexpr(t_addr0),
   6242                  binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
   6243 
   6244    /* t_addr1 now holds effective address */
   6245 
   6246    assign( t_bitno2,
   6247            unop(Iop_32to8,
   6248                 binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
   6249 
   6250    /* t_bitno2 contains offset of bit within byte */
   6251 
   6252    if (op != BtOpNone) {
   6253       t_mask = newTemp(Ity_I8);
   6254       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   6255    }
   6256 
   6257    /* t_mask is now a suitable byte mask */
   6258 
   6259    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   6260 
   6261    if (op != BtOpNone) {
   6262       switch (op) {
   6263          case BtOpSet:
   6264             assign( t_new,
   6265                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6266             break;
   6267          case BtOpComp:
   6268             assign( t_new,
   6269                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   6270             break;
   6271          case BtOpReset:
   6272             assign( t_new,
   6273                     binop(Iop_And8, mkexpr(t_fetched),
   6274                                     unop(Iop_Not8, mkexpr(t_mask))) );
   6275             break;
   6276          default:
   6277             vpanic("dis_bt_G_E(x86)");
   6278       }
   6279       if (locked && !epartIsReg(modrm)) {
   6280          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   6281                                  mkexpr(t_new)/*new*/,
   6282                                  guest_EIP_curr_instr );
   6283       } else {
   6284          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   6285       }
   6286    }
   6287 
   6288    /* Side effect done; now get selected bit into Carry flag */
   6289    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   6290    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6291    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6292    stmt( IRStmt_Put(
   6293             OFFB_CC_DEP1,
   6294             binop(Iop_And32,
   6295                   binop(Iop_Shr32,
   6296                         unop(Iop_8Uto32, mkexpr(t_fetched)),
   6297                         mkexpr(t_bitno2)),
   6298                   mkU32(1)))
   6299        );
   6300    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6301       elimination of previous stores to this field work better. */
   6302    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6303 
   6304    /* Move reg operand from stack back to reg */
   6305    if (epartIsReg(modrm)) {
   6306       /* t_esp still points at it. */
   6307       putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
   6308       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
   6309    }
   6310 
   6311    DIP("bt%s%c %s, %s\n",
   6312        nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
   6313        ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
   6314 
   6315    return delta;
   6316 }
   6317 
   6318 
   6319 
   6320 /* Handle BSF/BSR.  Only v-size seems necessary. */
   6321 static
   6322 UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
   6323 {
   6324    Bool   isReg;
   6325    UChar  modrm;
   6326    HChar  dis_buf[50];
   6327 
   6328    IRType ty  = szToITy(sz);
   6329    IRTemp src = newTemp(ty);
   6330    IRTemp dst = newTemp(ty);
   6331 
   6332    IRTemp src32 = newTemp(Ity_I32);
   6333    IRTemp dst32 = newTemp(Ity_I32);
   6334    IRTemp src8  = newTemp(Ity_I8);
   6335 
   6336    vassert(sz == 4 || sz == 2);
   6337 
   6338    modrm = getIByte(delta);
   6339 
   6340    isReg = epartIsReg(modrm);
   6341    if (isReg) {
   6342       delta++;
   6343       assign( src, getIReg(sz, eregOfRM(modrm)) );
   6344    } else {
   6345       Int    len;
   6346       IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
   6347       delta += len;
   6348       assign( src, loadLE(ty, mkexpr(addr)) );
   6349    }
   6350 
   6351    DIP("bs%c%c %s, %s\n",
   6352        fwds ? 'f' : 'r', nameISize(sz),
   6353        ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
   6354        nameIReg(sz, gregOfRM(modrm)));
   6355 
   6356    /* Generate an 8-bit expression which is zero iff the
   6357       original is zero, and nonzero otherwise */
   6358    assign( src8,
   6359            unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
   6360                            mkexpr(src), mkU(ty,0))) );
   6361 
   6362    /* Flags: Z is 1 iff source value is zero.  All others
   6363       are undefined -- we force them to zero. */
   6364    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6365    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6366    stmt( IRStmt_Put(
   6367             OFFB_CC_DEP1,
   6368             IRExpr_Mux0X( mkexpr(src8),
   6369                           /* src==0 */
   6370                           mkU32(X86G_CC_MASK_Z),
   6371                           /* src!=0 */
   6372                           mkU32(0)
   6373                         )
   6374        ));
   6375    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6376       elimination of previous stores to this field work better. */
   6377    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6378 
   6379    /* Result: iff source value is zero, we can't use
   6380       Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
   6381       But anyway, Intel x86 semantics say the result is undefined in
   6382       such situations.  Hence handle the zero case specially. */
   6383 
   6384    /* Bleh.  What we compute:
   6385 
   6386           bsf32:  if src == 0 then 0 else  Ctz32(src)
   6387           bsr32:  if src == 0 then 0 else  31 - Clz32(src)
   6388 
   6389           bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
   6390           bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
   6391 
   6392       First, widen src to 32 bits if it is not already.
   6393 
   6394       Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
   6395       dst register unchanged when src == 0.  Hence change accordingly.
   6396    */
   6397    if (sz == 2)
   6398       assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
   6399    else
   6400       assign( src32, mkexpr(src) );
   6401 
   6402    /* The main computation, guarding against zero. */
   6403    assign( dst32,
   6404            IRExpr_Mux0X(
   6405               mkexpr(src8),
   6406               /* src == 0 -- leave dst unchanged */
   6407               widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
   6408               /* src != 0 */
   6409               fwds ? unop(Iop_Ctz32, mkexpr(src32))
   6410                    : binop(Iop_Sub32,
   6411                            mkU32(31),
   6412                            unop(Iop_Clz32, mkexpr(src32)))
   6413            )
   6414          );
   6415 
   6416    if (sz == 2)
   6417       assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
   6418    else
   6419       assign( dst, mkexpr(dst32) );
   6420 
   6421    /* dump result back */
   6422    putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
   6423 
   6424    return delta;
   6425 }
   6426 
   6427 
   6428 static
   6429 void codegen_xchg_eAX_Reg ( Int sz, Int reg )
   6430 {
   6431    IRType ty = szToITy(sz);
   6432    IRTemp t1 = newTemp(ty);
   6433    IRTemp t2 = newTemp(ty);
   6434    vassert(sz == 2 || sz == 4);
   6435    assign( t1, getIReg(sz, R_EAX) );
   6436    assign( t2, getIReg(sz, reg) );
   6437    putIReg( sz, R_EAX, mkexpr(t2) );
   6438    putIReg( sz, reg, mkexpr(t1) );
   6439    DIP("xchg%c %s, %s\n",
   6440        nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
   6441 }
   6442 
   6443 
   6444 static
   6445 void codegen_SAHF ( void )
   6446 {
   6447    /* Set the flags to:
   6448       (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
   6449       | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6450                 |X86G_CC_MASK_P|X86G_CC_MASK_C)
   6451    */
   6452    UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6453                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6454    IRTemp oldflags   = newTemp(Ity_I32);
   6455    assign( oldflags, mk_x86g_calculate_eflags_all() );
   6456    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   6457    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6458    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   6459    stmt( IRStmt_Put( OFFB_CC_DEP1,
   6460          binop(Iop_Or32,
   6461                binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
   6462                binop(Iop_And32,
   6463                      binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
   6464                      mkU32(mask_SZACP))
   6465               )
   6466    ));
   6467    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   6468       elimination of previous stores to this field work better. */
   6469    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   6470 }
   6471 
   6472 
   6473 static
   6474 void codegen_LAHF ( void  )
   6475 {
   6476    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   6477    IRExpr* eax_with_hole;
   6478    IRExpr* new_byte;
   6479    IRExpr* new_eax;
   6480    UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
   6481                         |X86G_CC_MASK_C|X86G_CC_MASK_P;
   6482 
   6483    IRTemp  flags = newTemp(Ity_I32);
   6484    assign( flags, mk_x86g_calculate_eflags_all() );
   6485 
   6486    eax_with_hole
   6487       = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
   6488    new_byte
   6489       = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
   6490                         mkU32(1<<1));
   6491    new_eax
   6492       = binop(Iop_Or32, eax_with_hole,
   6493                         binop(Iop_Shl32, new_byte, mkU8(8)));
   6494    putIReg(4, R_EAX, new_eax);
   6495 }
   6496 
   6497 
   6498 static
   6499 UInt dis_cmpxchg_G_E ( UChar       sorb,
   6500                        Bool        locked,
   6501                        Int         size,
   6502                        Int         delta0 )
   6503 {
   6504    HChar dis_buf[50];
   6505    Int   len;
   6506 
   6507    IRType ty    = szToITy(size);
   6508    IRTemp acc   = newTemp(ty);
   6509    IRTemp src   = newTemp(ty);
   6510    IRTemp dest  = newTemp(ty);
   6511    IRTemp dest2 = newTemp(ty);
   6512    IRTemp acc2  = newTemp(ty);
   6513    IRTemp cond8 = newTemp(Ity_I8);
   6514    IRTemp addr  = IRTemp_INVALID;
   6515    UChar  rm    = getUChar(delta0);
   6516 
   6517    /* There are 3 cases to consider:
   6518 
   6519       reg-reg: ignore any lock prefix, generate sequence based
   6520                on Mux0X
   6521 
   6522       reg-mem, not locked: ignore any lock prefix, generate sequence
   6523                            based on Mux0X
   6524 
   6525       reg-mem, locked: use IRCAS
   6526    */
   6527    if (epartIsReg(rm)) {
   6528       /* case 1 */
   6529       assign( dest, getIReg(size, eregOfRM(rm)) );
   6530       delta0++;
   6531       assign( src, getIReg(size, gregOfRM(rm)) );
   6532       assign( acc, getIReg(size, R_EAX) );
   6533       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6534       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6535       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6536       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6537       putIReg(size, R_EAX, mkexpr(acc2));
   6538       putIReg(size, eregOfRM(rm), mkexpr(dest2));
   6539       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6540                                nameIReg(size,gregOfRM(rm)),
   6541                                nameIReg(size,eregOfRM(rm)) );
   6542    }
   6543    else if (!epartIsReg(rm) && !locked) {
   6544       /* case 2 */
   6545       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6546       assign( dest, loadLE(ty, mkexpr(addr)) );
   6547       delta0 += len;
   6548       assign( src, getIReg(size, gregOfRM(rm)) );
   6549       assign( acc, getIReg(size, R_EAX) );
   6550       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6551       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6552       assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
   6553       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6554       putIReg(size, R_EAX, mkexpr(acc2));
   6555       storeLE( mkexpr(addr), mkexpr(dest2) );
   6556       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6557                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6558    }
   6559    else if (!epartIsReg(rm) && locked) {
   6560       /* case 3 */
   6561       /* src is new value.  acc is expected value.  dest is old value.
   6562          Compute success from the output of the IRCAS, and steer the
   6563          new value for EAX accordingly: in case of success, EAX is
   6564          unchanged. */
   6565       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6566       delta0 += len;
   6567       assign( src, getIReg(size, gregOfRM(rm)) );
   6568       assign( acc, getIReg(size, R_EAX) );
   6569       stmt( IRStmt_CAS(
   6570          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   6571                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   6572       ));
   6573       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   6574       assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
   6575       assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
   6576       putIReg(size, R_EAX, mkexpr(acc2));
   6577       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   6578                                nameIReg(size,gregOfRM(rm)), dis_buf);
   6579    }
   6580    else vassert(0);
   6581 
   6582    return delta0;
   6583 }
   6584 
   6585 
   6586 /* Handle conditional move instructions of the form
   6587       cmovcc E(reg-or-mem), G(reg)
   6588 
   6589    E(src) is reg-or-mem
   6590    G(dst) is reg.
   6591 
   6592    If E is reg, -->    GET %E, tmps
   6593                        GET %G, tmpd
   6594                        CMOVcc tmps, tmpd
   6595                        PUT tmpd, %G
   6596 
   6597    If E is mem  -->    (getAddr E) -> tmpa
   6598                        LD (tmpa), tmps
   6599                        GET %G, tmpd
   6600                        CMOVcc tmps, tmpd
   6601                        PUT tmpd, %G
   6602 */
   6603 static
   6604 UInt dis_cmov_E_G ( UChar       sorb,
   6605                     Int         sz,
   6606                     X86Condcode cond,
   6607                     Int         delta0 )
   6608 {
   6609    UChar rm  = getIByte(delta0);
   6610    HChar dis_buf[50];
   6611    Int   len;
   6612 
   6613    IRType ty   = szToITy(sz);
   6614    IRTemp tmps = newTemp(ty);
   6615    IRTemp tmpd = newTemp(ty);
   6616 
   6617    if (epartIsReg(rm)) {
   6618       assign( tmps, getIReg(sz, eregOfRM(rm)) );
   6619       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6620 
   6621       putIReg(sz, gregOfRM(rm),
   6622                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6623                                      mk_x86g_calculate_condition(cond)),
   6624                                 mkexpr(tmpd),
   6625                                 mkexpr(tmps) )
   6626              );
   6627       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6628                               name_X86Condcode(cond),
   6629                               nameIReg(sz,eregOfRM(rm)),
   6630                               nameIReg(sz,gregOfRM(rm)));
   6631       return 1+delta0;
   6632    }
   6633 
   6634    /* E refers to memory */
   6635    {
   6636       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6637       assign( tmps, loadLE(ty, mkexpr(addr)) );
   6638       assign( tmpd, getIReg(sz, gregOfRM(rm)) );
   6639 
   6640       putIReg(sz, gregOfRM(rm),
   6641                   IRExpr_Mux0X( unop(Iop_1Uto8,
   6642                                      mk_x86g_calculate_condition(cond)),
   6643                                 mkexpr(tmpd),
   6644                                 mkexpr(tmps) )
   6645              );
   6646 
   6647       DIP("cmov%c%s %s,%s\n", nameISize(sz),
   6648                               name_X86Condcode(cond),
   6649                               dis_buf,
   6650                               nameIReg(sz,gregOfRM(rm)));
   6651       return len+delta0;
   6652    }
   6653 }
   6654 
   6655 
   6656 static
   6657 UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
   6658                     Bool* decodeOK )
   6659 {
   6660    Int   len;
   6661    UChar rm = getIByte(delta0);
   6662    HChar dis_buf[50];
   6663 
   6664    IRType ty    = szToITy(sz);
   6665    IRTemp tmpd  = newTemp(ty);
   6666    IRTemp tmpt0 = newTemp(ty);
   6667    IRTemp tmpt1 = newTemp(ty);
   6668 
   6669    /* There are 3 cases to consider:
   6670 
   6671       reg-reg: ignore any lock prefix,
   6672                generate 'naive' (non-atomic) sequence
   6673 
   6674       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   6675                            (non-atomic) sequence
   6676 
   6677       reg-mem, locked: use IRCAS
   6678    */
   6679 
   6680    if (epartIsReg(rm)) {
   6681       /* case 1 */
   6682       assign( tmpd,  getIReg(sz, eregOfRM(rm)));
   6683       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6684       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6685                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6686       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6687       putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
   6688       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6689       DIP("xadd%c %s, %s\n",
   6690           nameISize(sz), nameIReg(sz,gregOfRM(rm)),
   6691           				 nameIReg(sz,eregOfRM(rm)));
   6692       *decodeOK = True;
   6693       return 1+delta0;
   6694    }
   6695    else if (!epartIsReg(rm) && !locked) {
   6696       /* case 2 */
   6697       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6698       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6699       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6700       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6701                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6702       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   6703       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6704       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6705       DIP("xadd%c %s, %s\n",
   6706           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6707       *decodeOK = True;
   6708       return len+delta0;
   6709    }
   6710    else if (!epartIsReg(rm) && locked) {
   6711       /* case 3 */
   6712       IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
   6713       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   6714       assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
   6715       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   6716                            mkexpr(tmpd), mkexpr(tmpt0)) );
   6717       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   6718                            mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
   6719       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   6720       putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
   6721       DIP("xadd%c %s, %s\n",
   6722           nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
   6723       *decodeOK = True;
   6724       return len+delta0;
   6725    }
   6726    /*UNREACHED*/
   6727    vassert(0);
   6728 }
   6729 
   6730 /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   6731 
   6732 static
   6733 UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
   6734 {
   6735    Int    len;
   6736    IRTemp addr;
   6737    UChar  rm  = getIByte(delta0);
   6738    HChar  dis_buf[50];
   6739 
   6740    if (epartIsReg(rm)) {
   6741       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   6742       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   6743       return 1+delta0;
   6744    } else {
   6745       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6746       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   6747       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   6748       return len+delta0;
   6749    }
   6750 }
   6751 
   6752 /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   6753    dst is ireg and sz==4, zero out top half of it.  */
   6754 
   6755 static
   6756 UInt dis_mov_Sw_Ew ( UChar sorb,
   6757                      Int   sz,
   6758                      Int   delta0 )
   6759 {
   6760    Int    len;
   6761    IRTemp addr;
   6762    UChar  rm  = getIByte(delta0);
   6763    HChar  dis_buf[50];
   6764 
   6765    vassert(sz == 2 || sz == 4);
   6766 
   6767    if (epartIsReg(rm)) {
   6768       if (sz == 4)
   6769          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   6770       else
   6771          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   6772 
   6773       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   6774       return 1+delta0;
   6775    } else {
   6776       addr = disAMode ( &len, sorb, delta0, dis_buf );
   6777       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   6778       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   6779       return len+delta0;
   6780    }
   6781 }
   6782 
   6783 
   6784 static
   6785 void dis_push_segreg ( UInt sreg, Int sz )
   6786 {
   6787     IRTemp t1 = newTemp(Ity_I16);
   6788     IRTemp ta = newTemp(Ity_I32);
   6789     vassert(sz == 2 || sz == 4);
   6790 
   6791     assign( t1, getSReg(sreg) );
   6792     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   6793     putIReg(4, R_ESP, mkexpr(ta));
   6794     storeLE( mkexpr(ta), mkexpr(t1) );
   6795 
   6796     DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6797 }
   6798 
   6799 static
   6800 void dis_pop_segreg ( UInt sreg, Int sz )
   6801 {
   6802     IRTemp t1 = newTemp(Ity_I16);
   6803     IRTemp ta = newTemp(Ity_I32);
   6804     vassert(sz == 2 || sz == 4);
   6805 
   6806     assign( ta, getIReg(4, R_ESP) );
   6807     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   6808 
   6809     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   6810     putSReg( sreg, mkexpr(t1) );
   6811     DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
   6812 }
   6813 
   6814 static
   6815 void dis_ret ( UInt d32 )
   6816 {
   6817    IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
   6818    assign(t1, getIReg(4,R_ESP));
   6819    assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   6820    putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
   6821    jmp_treg(Ijk_Ret,t2);
   6822 }
   6823 
   6824 /*------------------------------------------------------------*/
   6825 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   6826 /*------------------------------------------------------------*/
   6827 
   6828 /* Worker function; do not call directly.
   6829    Handles full width G = G `op` E   and   G = (not G) `op` E.
   6830 */
   6831 
   6832 static UInt dis_SSE_E_to_G_all_wrk (
   6833                UChar sorb, Int delta,
   6834                HChar* opname, IROp op,
   6835                Bool   invertG
   6836             )
   6837 {
   6838    HChar   dis_buf[50];
   6839    Int     alen;
   6840    IRTemp  addr;
   6841    UChar   rm = getIByte(delta);
   6842    IRExpr* gpart
   6843       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
   6844                 : getXMMReg(gregOfRM(rm));
   6845    if (epartIsReg(rm)) {
   6846       putXMMReg( gregOfRM(rm),
   6847                  binop(op, gpart,
   6848                            getXMMReg(eregOfRM(rm))) );
   6849       DIP("%s %s,%s\n", opname,
   6850                         nameXMMReg(eregOfRM(rm)),
   6851                         nameXMMReg(gregOfRM(rm)) );
   6852       return delta+1;
   6853    } else {
   6854       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6855       putXMMReg( gregOfRM(rm),
   6856                  binop(op, gpart,
   6857                            loadLE(Ity_V128, mkexpr(addr))) );
   6858       DIP("%s %s,%s\n", opname,
   6859                         dis_buf,
   6860                         nameXMMReg(gregOfRM(rm)) );
   6861       return delta+alen;
   6862    }
   6863 }
   6864 
   6865 
   6866 /* All lanes SSE binary operation, G = G `op` E. */
   6867 
   6868 static
   6869 UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
   6870 {
   6871    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
   6872 }
   6873 
   6874 /* All lanes SSE binary operation, G = (not G) `op` E. */
   6875 
   6876 static
   6877 UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
   6878                                HChar* opname, IROp op )
   6879 {
   6880    return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
   6881 }
   6882 
   6883 
   6884 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   6885 
   6886 static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
   6887                                   HChar* opname, IROp op )
   6888 {
   6889    HChar   dis_buf[50];
   6890    Int     alen;
   6891    IRTemp  addr;
   6892    UChar   rm = getIByte(delta);
   6893    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6894    if (epartIsReg(rm)) {
   6895       putXMMReg( gregOfRM(rm),
   6896                  binop(op, gpart,
   6897                            getXMMReg(eregOfRM(rm))) );
   6898       DIP("%s %s,%s\n", opname,
   6899                         nameXMMReg(eregOfRM(rm)),
   6900                         nameXMMReg(gregOfRM(rm)) );
   6901       return delta+1;
   6902    } else {
   6903       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   6904          E operand needs to be made simply of zeroes. */
   6905       IRTemp epart = newTemp(Ity_V128);
   6906       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6907       assign( epart, unop( Iop_32UtoV128,
   6908                            loadLE(Ity_I32, mkexpr(addr))) );
   6909       putXMMReg( gregOfRM(rm),
   6910                  binop(op, gpart, mkexpr(epart)) );
   6911       DIP("%s %s,%s\n", opname,
   6912                         dis_buf,
   6913                         nameXMMReg(gregOfRM(rm)) );
   6914       return delta+alen;
   6915    }
   6916 }
   6917 
   6918 
   6919 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   6920 
   6921 static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
   6922                                   HChar* opname, IROp op )
   6923 {
   6924    HChar   dis_buf[50];
   6925    Int     alen;
   6926    IRTemp  addr;
   6927    UChar   rm = getIByte(delta);
   6928    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   6929    if (epartIsReg(rm)) {
   6930       putXMMReg( gregOfRM(rm),
   6931                  binop(op, gpart,
   6932                            getXMMReg(eregOfRM(rm))) );
   6933       DIP("%s %s,%s\n", opname,
   6934                         nameXMMReg(eregOfRM(rm)),
   6935                         nameXMMReg(gregOfRM(rm)) );
   6936       return delta+1;
   6937    } else {
   6938       /* We can only do a 64-bit memory read, so the upper half of the
   6939          E operand needs to be made simply of zeroes. */
   6940       IRTemp epart = newTemp(Ity_V128);
   6941       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6942       assign( epart, unop( Iop_64UtoV128,
   6943                            loadLE(Ity_I64, mkexpr(addr))) );
   6944       putXMMReg( gregOfRM(rm),
   6945                  binop(op, gpart, mkexpr(epart)) );
   6946       DIP("%s %s,%s\n", opname,
   6947                         dis_buf,
   6948                         nameXMMReg(gregOfRM(rm)) );
   6949       return delta+alen;
   6950    }
   6951 }
   6952 
   6953 
   6954 /* All lanes unary SSE operation, G = op(E). */
   6955 
   6956 static UInt dis_SSE_E_to_G_unary_all (
   6957                UChar sorb, Int delta,
   6958                HChar* opname, IROp op
   6959             )
   6960 {
   6961    HChar   dis_buf[50];
   6962    Int     alen;
   6963    IRTemp  addr;
   6964    UChar   rm = getIByte(delta);
   6965    if (epartIsReg(rm)) {
   6966       putXMMReg( gregOfRM(rm),
   6967                  unop(op, getXMMReg(eregOfRM(rm))) );
   6968       DIP("%s %s,%s\n", opname,
   6969                         nameXMMReg(eregOfRM(rm)),
   6970                         nameXMMReg(gregOfRM(rm)) );
   6971       return delta+1;
   6972    } else {
   6973       addr = disAMode ( &alen, sorb, delta, dis_buf );
   6974       putXMMReg( gregOfRM(rm),
   6975                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   6976       DIP("%s %s,%s\n", opname,
   6977                         dis_buf,
   6978                         nameXMMReg(gregOfRM(rm)) );
   6979       return delta+alen;
   6980    }
   6981 }
   6982 
   6983 
   6984 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   6985 
   6986 static UInt dis_SSE_E_to_G_unary_lo32 (
   6987                UChar sorb, Int delta,
   6988                HChar* opname, IROp op
   6989             )
   6990 {
   6991    /* First we need to get the old G value and patch the low 32 bits
   6992       of the E operand into it.  Then apply op and write back to G. */
   6993    HChar   dis_buf[50];
   6994    Int     alen;
   6995    IRTemp  addr;
   6996    UChar   rm = getIByte(delta);
   6997    IRTemp  oldG0 = newTemp(Ity_V128);
   6998    IRTemp  oldG1 = newTemp(Ity_V128);
   6999 
   7000    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7001 
   7002    if (epartIsReg(rm)) {
   7003       assign( oldG1,
   7004               binop( Iop_SetV128lo32,
   7005                      mkexpr(oldG0),
   7006                      getXMMRegLane32(eregOfRM(rm), 0)) );
   7007       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7008       DIP("%s %s,%s\n", opname,
   7009                         nameXMMReg(eregOfRM(rm)),
   7010                         nameXMMReg(gregOfRM(rm)) );
   7011       return delta+1;
   7012    } else {
   7013       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7014       assign( oldG1,
   7015               binop( Iop_SetV128lo32,
   7016                      mkexpr(oldG0),
   7017                      loadLE(Ity_I32, mkexpr(addr)) ));
   7018       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7019       DIP("%s %s,%s\n", opname,
   7020                         dis_buf,
   7021                         nameXMMReg(gregOfRM(rm)) );
   7022       return delta+alen;
   7023    }
   7024 }
   7025 
   7026 
   7027 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   7028 
   7029 static UInt dis_SSE_E_to_G_unary_lo64 (
   7030                UChar sorb, Int delta,
   7031                HChar* opname, IROp op
   7032             )
   7033 {
   7034    /* First we need to get the old G value and patch the low 64 bits
   7035       of the E operand into it.  Then apply op and write back to G. */
   7036    HChar   dis_buf[50];
   7037    Int     alen;
   7038    IRTemp  addr;
   7039    UChar   rm = getIByte(delta);
   7040    IRTemp  oldG0 = newTemp(Ity_V128);
   7041    IRTemp  oldG1 = newTemp(Ity_V128);
   7042 
   7043    assign( oldG0, getXMMReg(gregOfRM(rm)) );
   7044 
   7045    if (epartIsReg(rm)) {
   7046       assign( oldG1,
   7047               binop( Iop_SetV128lo64,
   7048                      mkexpr(oldG0),
   7049                      getXMMRegLane64(eregOfRM(rm), 0)) );
   7050       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7051       DIP("%s %s,%s\n", opname,
   7052                         nameXMMReg(eregOfRM(rm)),
   7053                         nameXMMReg(gregOfRM(rm)) );
   7054       return delta+1;
   7055    } else {
   7056       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7057       assign( oldG1,
   7058               binop( Iop_SetV128lo64,
   7059                      mkexpr(oldG0),
   7060                      loadLE(Ity_I64, mkexpr(addr)) ));
   7061       putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
   7062       DIP("%s %s,%s\n", opname,
   7063                         dis_buf,
   7064                         nameXMMReg(gregOfRM(rm)) );
   7065       return delta+alen;
   7066    }
   7067 }
   7068 
   7069 
   7070 /* SSE integer binary operation:
   7071       G = G `op` E   (eLeft == False)
   7072       G = E `op` G   (eLeft == True)
   7073 */
   7074 static UInt dis_SSEint_E_to_G(
   7075                UChar sorb, Int delta,
   7076                HChar* opname, IROp op,
   7077                Bool   eLeft
   7078             )
   7079 {
   7080    HChar   dis_buf[50];
   7081    Int     alen;
   7082    IRTemp  addr;
   7083    UChar   rm = getIByte(delta);
   7084    IRExpr* gpart = getXMMReg(gregOfRM(rm));
   7085    IRExpr* epart = NULL;
   7086    if (epartIsReg(rm)) {
   7087       epart = getXMMReg(eregOfRM(rm));
   7088       DIP("%s %s,%s\n", opname,
   7089                         nameXMMReg(eregOfRM(rm)),
   7090                         nameXMMReg(gregOfRM(rm)) );
   7091       delta += 1;
   7092    } else {
   7093       addr  = disAMode ( &alen, sorb, delta, dis_buf );
   7094       epart = loadLE(Ity_V128, mkexpr(addr));
   7095       DIP("%s %s,%s\n", opname,
   7096                         dis_buf,
   7097                         nameXMMReg(gregOfRM(rm)) );
   7098       delta += alen;
   7099    }
   7100    putXMMReg( gregOfRM(rm),
   7101               eLeft ? binop(op, epart, gpart)
   7102 	            : binop(op, gpart, epart) );
   7103    return delta;
   7104 }
   7105 
   7106 
   7107 /* Helper for doing SSE FP comparisons. */
   7108 
   7109 static void findSSECmpOp ( Bool* needNot, IROp* op,
   7110                            Int imm8, Bool all_lanes, Int sz )
   7111 {
   7112    imm8 &= 7;
   7113    *needNot = False;
   7114    *op      = Iop_INVALID;
   7115    if (imm8 >= 4) {
   7116       *needNot = True;
   7117       imm8 -= 4;
   7118    }
   7119 
   7120    if (sz == 4 && all_lanes) {
   7121       switch (imm8) {
   7122          case 0: *op = Iop_CmpEQ32Fx4; return;
   7123          case 1: *op = Iop_CmpLT32Fx4; return;
   7124          case 2: *op = Iop_CmpLE32Fx4; return;
   7125          case 3: *op = Iop_CmpUN32Fx4; return;
   7126          default: break;
   7127       }
   7128    }
   7129    if (sz == 4 && !all_lanes) {
   7130       switch (imm8) {
   7131          case 0: *op = Iop_CmpEQ32F0x4; return;
   7132          case 1: *op = Iop_CmpLT32F0x4; return;
   7133          case 2: *op = Iop_CmpLE32F0x4; return;
   7134          case 3: *op = Iop_CmpUN32F0x4; return;
   7135          default: break;
   7136       }
   7137    }
   7138    if (sz == 8 && all_lanes) {
   7139       switch (imm8) {
   7140          case 0: *op = Iop_CmpEQ64Fx2; return;
   7141          case 1: *op = Iop_CmpLT64Fx2; return;
   7142          case 2: *op = Iop_CmpLE64Fx2; return;
   7143          case 3: *op = Iop_CmpUN64Fx2; return;
   7144          default: break;
   7145       }
   7146    }
   7147    if (sz == 8 && !all_lanes) {
   7148       switch (imm8) {
   7149          case 0: *op = Iop_CmpEQ64F0x2; return;
   7150          case 1: *op = Iop_CmpLT64F0x2; return;
   7151          case 2: *op = Iop_CmpLE64F0x2; return;
   7152          case 3: *op = Iop_CmpUN64F0x2; return;
   7153          default: break;
   7154       }
   7155    }
   7156    vpanic("findSSECmpOp(x86,guest)");
   7157 }
   7158 
   7159 /* Handles SSE 32F/64F comparisons. */
   7160 
   7161 static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
   7162 				HChar* opname, Bool all_lanes, Int sz )
   7163 {
   7164    HChar   dis_buf[50];
   7165    Int     alen, imm8;
   7166    IRTemp  addr;
   7167    Bool    needNot = False;
   7168    IROp    op      = Iop_INVALID;
   7169    IRTemp  plain   = newTemp(Ity_V128);
   7170    UChar   rm      = getIByte(delta);
   7171    UShort  mask    = 0;
   7172    vassert(sz == 4 || sz == 8);
   7173    if (epartIsReg(rm)) {
   7174       imm8 = getIByte(delta+1);
   7175       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7176       assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
   7177                                getXMMReg(eregOfRM(rm))) );
   7178       delta += 2;
   7179       DIP("%s $%d,%s,%s\n", opname,
   7180                             (Int)imm8,
   7181                             nameXMMReg(eregOfRM(rm)),
   7182                             nameXMMReg(gregOfRM(rm)) );
   7183    } else {
   7184       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7185       imm8 = getIByte(delta+alen);
   7186       findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
   7187       assign( plain,
   7188               binop(
   7189                  op,
   7190                  getXMMReg(gregOfRM(rm)),
   7191                    all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
   7192                  : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   7193                  : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   7194              )
   7195       );
   7196       delta += alen+1;
   7197       DIP("%s $%d,%s,%s\n", opname,
   7198                             (Int)imm8,
   7199                             dis_buf,
   7200                             nameXMMReg(gregOfRM(rm)) );
   7201    }
   7202 
   7203    if (needNot && all_lanes) {
   7204       putXMMReg( gregOfRM(rm),
   7205                  unop(Iop_NotV128, mkexpr(plain)) );
   7206    }
   7207    else
   7208    if (needNot && !all_lanes) {
   7209       mask = toUShort( sz==4 ? 0x000F : 0x00FF );
   7210       putXMMReg( gregOfRM(rm),
   7211                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   7212    }
   7213    else {
   7214       putXMMReg( gregOfRM(rm), mkexpr(plain) );
   7215    }
   7216 
   7217    return delta;
   7218 }
   7219 
   7220 
   7221 /* Vector by scalar shift of G by the amount specified at the bottom
   7222    of E. */
   7223 
   7224 static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
   7225                                  HChar* opname, IROp op )
   7226 {
   7227    HChar   dis_buf[50];
   7228    Int     alen, size;
   7229    IRTemp  addr;
   7230    Bool    shl, shr, sar;
   7231    UChar   rm   = getIByte(delta);
   7232    IRTemp  g0   = newTemp(Ity_V128);
   7233    IRTemp  g1   = newTemp(Ity_V128);
   7234    IRTemp  amt  = newTemp(Ity_I32);
   7235    IRTemp  amt8 = newTemp(Ity_I8);
   7236    if (epartIsReg(rm)) {
   7237       assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
   7238       DIP("%s %s,%s\n", opname,
   7239                         nameXMMReg(eregOfRM(rm)),
   7240                         nameXMMReg(gregOfRM(rm)) );
   7241       delta++;
   7242    } else {
   7243       addr = disAMode ( &alen, sorb, delta, dis_buf );
   7244       assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
   7245       DIP("%s %s,%s\n", opname,
   7246                         dis_buf,
   7247                         nameXMMReg(gregOfRM(rm)) );
   7248       delta += alen;
   7249    }
   7250    assign( g0,   getXMMReg(gregOfRM(rm)) );
   7251    assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
   7252 
   7253    shl = shr = sar = False;
   7254    size = 0;
   7255    switch (op) {
   7256       case Iop_ShlN16x8: shl = True; size = 32; break;
   7257       case Iop_ShlN32x4: shl = True; size = 32; break;
   7258       case Iop_ShlN64x2: shl = True; size = 64; break;
   7259       case Iop_SarN16x8: sar = True; size = 16; break;
   7260       case Iop_SarN32x4: sar = True; size = 32; break;
   7261       case Iop_ShrN16x8: shr = True; size = 16; break;
   7262       case Iop_ShrN32x4: shr = True; size = 32; break;
   7263       case Iop_ShrN64x2: shr = True; size = 64; break;
   7264       default: vassert(0);
   7265    }
   7266 
   7267    if (shl || shr) {
   7268      assign(
   7269         g1,
   7270         IRExpr_Mux0X(
   7271            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7272            mkV128(0x0000),
   7273            binop(op, mkexpr(g0), mkexpr(amt8))
   7274         )
   7275      );
   7276    } else
   7277    if (sar) {
   7278      assign(
   7279         g1,
   7280         IRExpr_Mux0X(
   7281            unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
   7282            binop(op, mkexpr(g0), mkU8(size-1)),
   7283            binop(op, mkexpr(g0), mkexpr(amt8))
   7284         )
   7285      );
   7286    } else {
   7287       /*NOTREACHED*/
   7288       vassert(0);
   7289    }
   7290 
   7291    putXMMReg( gregOfRM(rm), mkexpr(g1) );
   7292    return delta;
   7293 }
   7294 
   7295 
   7296 /* Vector by scalar shift of E by an immediate byte. */
   7297 
   7298 static
   7299 UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
   7300 {
   7301    Bool    shl, shr, sar;
   7302    UChar   rm   = getIByte(delta);
   7303    IRTemp  e0   = newTemp(Ity_V128);
   7304    IRTemp  e1   = newTemp(Ity_V128);
   7305    UChar   amt, size;
   7306    vassert(epartIsReg(rm));
   7307    vassert(gregOfRM(rm) == 2
   7308            || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
   7309    amt = getIByte(delta+1);
   7310    delta += 2;
   7311    DIP("%s $%d,%s\n", opname,
   7312                       (Int)amt,
   7313                       nameXMMReg(eregOfRM(rm)) );
   7314    assign( e0, getXMMReg(eregOfRM(rm)) );
   7315 
   7316    shl = shr = sar = False;
   7317    size = 0;
   7318    switch (op) {
   7319       case Iop_ShlN16x8: shl = True; size = 16; break;
   7320       case Iop_ShlN32x4: shl = True; size = 32; break;
   7321       case Iop_ShlN64x2: shl = True; size = 64; break;
   7322       case Iop_SarN16x8: sar = True; size = 16; break;
   7323       case Iop_SarN32x4: sar = True; size = 32; break;
   7324       case Iop_ShrN16x8: shr = True; size = 16; break;
   7325       case Iop_ShrN32x4: shr = True; size = 32; break;
   7326       case Iop_ShrN64x2: shr = True; size = 64; break;
   7327       default: vassert(0);
   7328    }
   7329 
   7330    if (shl || shr) {
   7331       assign( e1, amt >= size
   7332                      ? mkV128(0x0000)
   7333                      : binop(op, mkexpr(e0), mkU8(amt))
   7334       );
   7335    } else
   7336    if (sar) {
   7337       assign( e1, amt >= size
   7338                      ? binop(op, mkexpr(e0), mkU8(size-1))
   7339                      : binop(op, mkexpr(e0), mkU8(amt))
   7340       );
   7341    } else {
   7342       /*NOTREACHED*/
   7343       vassert(0);
   7344    }
   7345 
   7346    putXMMReg( eregOfRM(rm), mkexpr(e1) );
   7347    return delta;
   7348 }
   7349 
   7350 
   7351 /* Get the current SSE rounding mode. */
   7352 
   7353 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   7354 {
   7355    return binop( Iop_And32,
   7356                  IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
   7357                  mkU32(3) );
   7358 }
   7359 
   7360 static void put_sse_roundingmode ( IRExpr* sseround )
   7361 {
   7362    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   7363    stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
   7364 }
   7365 
   7366 /* Break a 128-bit value up into four 32-bit ints. */
   7367 
   7368 static void breakup128to32s ( IRTemp t128,
   7369 			      /*OUTs*/
   7370                               IRTemp* t3, IRTemp* t2,
   7371                               IRTemp* t1, IRTemp* t0 )
   7372 {
   7373    IRTemp hi64 = newTemp(Ity_I64);
   7374    IRTemp lo64 = newTemp(Ity_I64);
   7375    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   7376    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   7377 
   7378    vassert(t0 && *t0 == IRTemp_INVALID);
   7379    vassert(t1 && *t1 == IRTemp_INVALID);
   7380    vassert(t2 && *t2 == IRTemp_INVALID);
   7381    vassert(t3 && *t3 == IRTemp_INVALID);
   7382 
   7383    *t0 = newTemp(Ity_I32);
   7384    *t1 = newTemp(Ity_I32);
   7385    *t2 = newTemp(Ity_I32);
   7386    *t3 = newTemp(Ity_I32);
   7387    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   7388    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   7389    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   7390    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   7391 }
   7392 
   7393 /* Construct a 128-bit value from four 32-bit ints. */
   7394 
   7395 static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
   7396                               IRTemp t1, IRTemp t0 )
   7397 {
   7398    return
   7399       binop( Iop_64HLtoV128,
   7400              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   7401              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   7402    );
   7403 }
   7404 
   7405 /* Break a 64-bit value up into four 16-bit ints. */
   7406 
   7407 static void breakup64to16s ( IRTemp t64,
   7408                              /*OUTs*/
   7409                              IRTemp* t3, IRTemp* t2,
   7410                              IRTemp* t1, IRTemp* t0 )
   7411 {
   7412    IRTemp hi32 = newTemp(Ity_I32);
   7413    IRTemp lo32 = newTemp(Ity_I32);
   7414    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   7415    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   7416 
   7417    vassert(t0 && *t0 == IRTemp_INVALID);
   7418    vassert(t1 && *t1 == IRTemp_INVALID);
   7419    vassert(t2 && *t2 == IRTemp_INVALID);
   7420    vassert(t3 && *t3 == IRTemp_INVALID);
   7421 
   7422    *t0 = newTemp(Ity_I16);
   7423    *t1 = newTemp(Ity_I16);
   7424    *t2 = newTemp(Ity_I16);
   7425    *t3 = newTemp(Ity_I16);
   7426    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   7427    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   7428    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   7429    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   7430 }
   7431 
   7432 /* Construct a 64-bit value from four 16-bit ints. */
   7433 
   7434 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   7435                              IRTemp t1, IRTemp t0 )
   7436 {
   7437    return
   7438       binop( Iop_32HLto64,
   7439              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   7440              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   7441    );
   7442 }
   7443 
   7444 /* Generate IR to set the guest %EFLAGS from the pushfl-format image
   7445    in the given 32-bit temporary.  The flags that are set are: O S Z A
   7446    C P D ID AC.
   7447 
   7448    In all cases, code to set AC is generated.  However, VEX actually
   7449    ignores the AC value and so can optionally emit an emulation
   7450    warning when it is enabled.  In this routine, an emulation warning
   7451    is only emitted if emit_AC_emwarn is True, in which case
   7452    next_insn_EIP must be correct (this allows for correct code
   7453    generation for popfl/popfw).  If emit_AC_emwarn is False,
   7454    next_insn_EIP is unimportant (this allows for easy if kludgey code
   7455    generation for IRET.) */
   7456 
   7457 static
   7458 void set_EFLAGS_from_value ( IRTemp t1,
   7459                              Bool   emit_AC_emwarn,
   7460                              Addr32 next_insn_EIP )
   7461 {
   7462    vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
   7463 
   7464    /* t1 is the flag word.  Mask out everything except OSZACP and set
   7465       the flags thunk to X86G_CC_OP_COPY. */
   7466    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   7467    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   7468    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7469                      binop(Iop_And32,
   7470                            mkexpr(t1),
   7471                            mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   7472                                   | X86G_CC_MASK_A | X86G_CC_MASK_Z
   7473                                   | X86G_CC_MASK_S| X86G_CC_MASK_O )
   7474                           )
   7475                     )
   7476        );
   7477    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   7478       elimination of previous stores to this field work better. */
   7479    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   7480 
   7481    /* Also need to set the D flag, which is held in bit 10 of t1.
   7482       If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   7483    stmt( IRStmt_Put(
   7484             OFFB_DFLAG,
   7485             IRExpr_Mux0X(
   7486                unop(Iop_32to8,
   7487                     binop(Iop_And32,
   7488                           binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
   7489                           mkU32(1))),
   7490                mkU32(1),
   7491                mkU32(0xFFFFFFFF)))
   7492        );
   7493 
   7494    /* Set the ID flag */
   7495    stmt( IRStmt_Put(
   7496             OFFB_IDFLAG,
   7497             IRExpr_Mux0X(
   7498                unop(Iop_32to8,
   7499                     binop(Iop_And32,
   7500                           binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
   7501                           mkU32(1))),
   7502                mkU32(0),
   7503                mkU32(1)))
   7504        );
   7505 
   7506    /* And set the AC flag.  If setting it 1 to, possibly emit an
   7507       emulation warning. */
   7508    stmt( IRStmt_Put(
   7509             OFFB_ACFLAG,
   7510             IRExpr_Mux0X(
   7511                unop(Iop_32to8,
   7512                     binop(Iop_And32,
   7513                           binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
   7514                           mkU32(1))),
   7515                mkU32(0),
   7516                mkU32(1)))
   7517        );
   7518 
   7519    if (emit_AC_emwarn) {
   7520       put_emwarn( mkU32(EmWarn_X86_acFlag) );
   7521       stmt(
   7522          IRStmt_Exit(
   7523             binop( Iop_CmpNE32,
   7524                    binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
   7525                    mkU32(0) ),
   7526             Ijk_EmWarn,
   7527             IRConst_U32( next_insn_EIP )
   7528          )
   7529       );
   7530    }
   7531 }
   7532 
   7533 
   7534 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   7535    values (aa,bb), computes, for each of the 4 16-bit lanes:
   7536 
   7537    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   7538 */
   7539 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   7540 {
   7541    IRTemp aa      = newTemp(Ity_I64);
   7542    IRTemp bb      = newTemp(Ity_I64);
   7543    IRTemp aahi32s = newTemp(Ity_I64);
   7544    IRTemp aalo32s = newTemp(Ity_I64);
   7545    IRTemp bbhi32s = newTemp(Ity_I64);
   7546    IRTemp bblo32s = newTemp(Ity_I64);
   7547    IRTemp rHi     = newTemp(Ity_I64);
   7548    IRTemp rLo     = newTemp(Ity_I64);
   7549    IRTemp one32x2 = newTemp(Ity_I64);
   7550    assign(aa, aax);
   7551    assign(bb, bbx);
   7552    assign( aahi32s,
   7553            binop(Iop_SarN32x2,
   7554                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   7555                  mkU8(16) ));
   7556    assign( aalo32s,
   7557            binop(Iop_SarN32x2,
   7558                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   7559                  mkU8(16) ));
   7560    assign( bbhi32s,
   7561            binop(Iop_SarN32x2,
   7562                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   7563                  mkU8(16) ));
   7564    assign( bblo32s,
   7565            binop(Iop_SarN32x2,
   7566                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   7567                  mkU8(16) ));
   7568    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   7569    assign(
   7570       rHi,
   7571       binop(
   7572          Iop_ShrN32x2,
   7573          binop(
   7574             Iop_Add32x2,
   7575             binop(
   7576                Iop_ShrN32x2,
   7577                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   7578                mkU8(14)
   7579             ),
   7580             mkexpr(one32x2)
   7581          ),
   7582          mkU8(1)
   7583       )
   7584    );
   7585    assign(
   7586       rLo,
   7587       binop(
   7588          Iop_ShrN32x2,
   7589          binop(
   7590             Iop_Add32x2,
   7591             binop(
   7592                Iop_ShrN32x2,
   7593                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   7594                mkU8(14)
   7595             ),
   7596             mkexpr(one32x2)
   7597          ),
   7598          mkU8(1)
   7599       )
   7600    );
   7601    return
   7602       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   7603 }
   7604 
   7605 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   7606    values (aa,bb), computes, for each lane:
   7607 
   7608           if aa_lane < 0 then - bb_lane
   7609      else if aa_lane > 0 then bb_lane
   7610      else 0
   7611 */
   7612 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   7613 {
   7614    IRTemp aa       = newTemp(Ity_I64);
   7615    IRTemp bb       = newTemp(Ity_I64);
   7616    IRTemp zero     = newTemp(Ity_I64);
   7617    IRTemp bbNeg    = newTemp(Ity_I64);
   7618    IRTemp negMask  = newTemp(Ity_I64);
   7619    IRTemp posMask  = newTemp(Ity_I64);
   7620    IROp   opSub    = Iop_INVALID;
   7621    IROp   opCmpGTS = Iop_INVALID;
   7622 
   7623    switch (laneszB) {
   7624       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   7625       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   7626       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   7627       default: vassert(0);
   7628    }
   7629 
   7630    assign( aa,      aax );
   7631    assign( bb,      bbx );
   7632    assign( zero,    mkU64(0) );
   7633    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   7634    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   7635    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   7636 
   7637    return
   7638       binop(Iop_Or64,
   7639             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   7640             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   7641 
   7642 }
   7643 
   7644 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   7645    value aa, computes, for each lane
   7646 
   7647    if aa < 0 then -aa else aa
   7648 
   7649    Note that the result is interpreted as unsigned, so that the
   7650    absolute value of the most negative signed input can be
   7651    represented.
   7652 */
   7653 static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
   7654 {
   7655    IRTemp aa      = newTemp(Ity_I64);
   7656    IRTemp zero    = newTemp(Ity_I64);
   7657    IRTemp aaNeg   = newTemp(Ity_I64);
   7658    IRTemp negMask = newTemp(Ity_I64);
   7659    IRTemp posMask = newTemp(Ity_I64);
   7660    IROp   opSub   = Iop_INVALID;
   7661    IROp   opSarN  = Iop_INVALID;
   7662 
   7663    switch (laneszB) {
   7664       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   7665       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   7666       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   7667       default: vassert(0);
   7668    }
   7669 
   7670    assign( aa,      aax );
   7671    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   7672    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   7673    assign( zero,    mkU64(0) );
   7674    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   7675    return
   7676       binop(Iop_Or64,
   7677             binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   7678             binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
   7679 }
   7680 
   7681 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   7682                                         IRTemp lo64, Int byteShift )
   7683 {
   7684    vassert(byteShift >= 1 && byteShift <= 7);
   7685    return
   7686       binop(Iop_Or64,
   7687             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   7688             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   7689       );
   7690 }
   7691 
   7692 /* Generate a SIGSEGV followed by a restart of the current instruction
   7693    if effective_addr is not 16-aligned.  This is required behaviour
   7694    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   7695    This assumes that guest_RIP_curr_instr is set correctly! */
   7696 /* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
   7697  * the check. Need to enable it once TSan stops generating unaligned
   7698  * accesses in the wrappers.
   7699  * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
   7700 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
   7701 {
   7702    stmt(
   7703       IRStmt_Exit(
   7704          binop(Iop_CmpNE32,
   7705                binop(Iop_And32,mkexpr(effective_addr),mkU32(0x0)),
   7706                mkU32(0)),
   7707          Ijk_SigSEGV,
   7708          IRConst_U32(guest_EIP_curr_instr)
   7709       )
   7710    );
   7711 }
   7712 
   7713 
   7714 /* Helper for deciding whether a given insn (starting at the opcode
   7715    byte) may validly be used with a LOCK prefix.  The following insns
   7716    may be used with LOCK when their destination operand is in memory.
   7717    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   7718 
   7719    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   7720    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   7721    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   7722    SBB        81 /3,  81 /3,  82 /x,  83 /3,  18,  19
   7723    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   7724    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   7725    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   7726 
   7727    DEC        FE /1,  FF /1
   7728    INC        FE /0,  FF /0
   7729 
   7730    NEG        F6 /3,  F7 /3
   7731    NOT        F6 /2,  F7 /2
   7732 
   7733    XCHG       86, 87
   7734 
   7735    BTC        0F BB,  0F BA /7
   7736    BTR        0F B3,  0F BA /6
   7737    BTS        0F AB,  0F BA /5
   7738 
   7739    CMPXCHG    0F B0,  0F B1
   7740    CMPXCHG8B  0F C7 /1
   7741 
   7742    XADD       0F C0,  0F C1
   7743 
   7744    ------------------------------
   7745 
   7746    80 /0  =  addb $imm8,  rm8
   7747    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   7748    82 /0  =  addb $imm8,  rm8
   7749    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   7750 
   7751    00     =  addb r8,  rm8
   7752    01     =  addl r32, rm32  and  addw r16, rm16
   7753 
   7754    Same for ADD OR ADC SBB AND SUB XOR
   7755 
   7756    FE /1  = dec rm8
   7757    FF /1  = dec rm32  and  dec rm16
   7758 
   7759    FE /0  = inc rm8
   7760    FF /0  = inc rm32  and  inc rm16
   7761 
   7762    F6 /3  = neg rm8
   7763    F7 /3  = neg rm32  and  neg rm16
   7764 
   7765    F6 /2  = not rm8
   7766    F7 /2  = not rm32  and  not rm16
   7767 
   7768    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   7769    OF BA /7  = btcw $imm8, rm16  and  btcw $imm8, rm32
   7770 
   7771    Same for BTS, BTR
   7772 */
   7773 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   7774 {
   7775    /* Decide whether the instruction whose opcode bytes begin at
   7776       opc[0] is one that may carry a LOCK prefix.  Every accepted
   7777       form needs a ModRM byte whose E part is memory rather than a
   7778       register (!epartIsReg); some opcodes additionally restrict
   7779       the ModRM reg field (gregOfRM).  opc[1] and opc[2] are read
   7780       only when the preceding opcode byte(s) call for it.  The
   7781       mnemonics noted below follow the standard x86 opcode map. */
   7782    UChar first = opc[0];
   7783 
   7784    if (first == 0x0F) {
   7785       /* Two-byte opcodes: the ModRM byte is opc[2]. */
   7786       switch (opc[1]) {
   7787          case 0xAB: case 0xB3: case 0xBB: /* BTS, BTR, BTC reg */
   7788          case 0xB0: case 0xB1:            /* CMPXCHG */
   7789          case 0xC0: case 0xC1:            /* XADD */
   7790             return epartIsReg(opc[2]) ? False : True;
   7791 
   7792          case 0xBA: /* /5 .. /7 only: BTS, BTR, BTC $imm8 */
   7793             if (gregOfRM(opc[2]) < 5 || gregOfRM(opc[2]) > 7)
   7794                return False;
   7795             return epartIsReg(opc[2]) ? False : True;
   7796 
   7797          case 0xC7: /* /1 only: CMPXCHG8B */
   7798             if (gregOfRM(opc[2]) != 1)
   7799                return False;
   7800             return epartIsReg(opc[2]) ? False : True;
   7801 
   7802          default:
   7803             return False;
   7804       }
   7805    }
   7806 
   7807    /* One-byte opcodes: the ModRM byte is opc[1]. */
   7808    switch (first) {
   7809       case 0x00: case 0x01: /* ADD  */
   7810       case 0x08: case 0x09: /* OR   */
   7811       case 0x10: case 0x11: /* ADC  */
   7812       case 0x18: case 0x19: /* SBB  */
   7813       case 0x20: case 0x21: /* AND  */
   7814       case 0x28: case 0x29: /* SUB  */
   7815       case 0x30: case 0x31: /* XOR  */
   7816       case 0x86: case 0x87: /* XCHG */
   7817          return epartIsReg(opc[1]) ? False : True;
   7818 
   7819       case 0x80: case 0x81: case 0x82: case 0x83:
   7820          /* /0 .. /6 only; /7 is rejected. */
   7821          if (gregOfRM(opc[1]) < 0 || gregOfRM(opc[1]) > 6)
   7822             return False;
   7823          return epartIsReg(opc[1]) ? False : True;
   7824 
   7825       case 0xFE: case 0xFF:
   7826          /* /0 and /1 only: INC, DEC. */
   7827          if (gregOfRM(opc[1]) < 0 || gregOfRM(opc[1]) > 1)
   7828             return False;
   7829          return epartIsReg(opc[1]) ? False : True;
   7830 
   7831       case 0xF6: case 0xF7:
   7832          /* /2 and /3 only: NOT, NEG. */
   7833          if (gregOfRM(opc[1]) < 2 || gregOfRM(opc[1]) > 3)
   7834             return False;
   7835          return epartIsReg(opc[1]) ? False : True;
   7836 
   7837       default:
   7838          break;
   7839    }
   7840    return False;
   7841 }
   7842 
   7843 
   7844 /*------------------------------------------------------------*/
   7845 /*--- Disassemble a single instruction                     ---*/
   7846 /*------------------------------------------------------------*/
   7847 
   7848 /* Disassemble a single instruction into IR.  The instruction is
   7849    located in host memory at &guest_code[delta].  *expect_CAS is set
   7850    to True if the resulting IR is expected to contain an IRCAS
   7851    statement, and False if it's not expected to.  This makes it
   7852    possible for the caller of disInstr_X86_WRK to check that
   7853    LOCK-prefixed instructions are at least plausibly translated, in
   7854    that it becomes possible to check that a (validly) LOCK-prefixed
   7855    instruction generates a translation containing an IRCAS, and
   7856    instructions without LOCK prefixes don't generate translations
   7857    containing an IRCAS.
   7858 */
   7859 static
   7860 DisResult disInstr_X86_WRK (
   7861              /*OUT*/Bool* expect_CAS,
   7862              Bool         put_IP,
   7863              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   7864              Bool         resteerCisOk,
   7865              void*        callback_opaque,
   7866              Long         delta64,
   7867              VexArchInfo* archinfo,
   7868              VexAbiInfo*  vbi
   7869           )
   7870 {
   7871    IRType    ty;
   7872    IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
   7873    Int       alen;
   7874    UChar     opc, modrm, abyte, pre;
   7875    UInt      d32;
   7876    HChar     dis_buf[50];
   7877    Int       am_sz, d_sz, n_prefixes;
   7878    DisResult dres;
   7879    UChar*    insn; /* used in SSE decoders */
   7880 
   7881    /* The running delta */
   7882    Int delta = (Int)delta64;
   7883 
   7884    /* Holds eip at the start of the insn, so that we can print
   7885       consistent error messages for unimplemented insns. */
   7886    Int delta_start = delta;
   7887 
   7888    /* sz denotes the nominal data-op size of the insn; we change it to
   7889       2 if an 0x66 prefix is seen */
   7890    Int sz = 4;
   7891 
   7892    /* sorb holds the segment-override-prefix byte, if any.  Zero if no
   7893       prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
   7894       indicating the prefix.  */
   7895    UChar sorb = 0;
   7896 
   7897    /* Gets set to True if a LOCK prefix is seen. */
   7898    Bool pfx_lock = False;
   7899 
   7900    /* Set result defaults. */
   7901    dres.whatNext   = Dis_Continue;
   7902    dres.len        = 0;
   7903    dres.continueAt = 0;
   7904 
   7905    *expect_CAS = False;
   7906 
   7907    addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   7908 
   7909    vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
   7910    DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
   7911 
   7912    /* We may be asked to update the guest EIP before going further. */
   7913    if (put_IP)
   7914       stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
   7915 
   7916    /* Spot "Special" instructions (see comment at top of file). */
   7917    {
   7918       UChar* code = (UChar*)(guest_code + delta);
   7919       /* Spot the 12-byte preamble:
   7920          C1C703   roll $3,  %edi
   7921          C1C70D   roll $13, %edi
   7922          C1C71D   roll $29, %edi
   7923          C1C713   roll $19, %edi
   7924       */
   7925       if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
   7926           code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
   7927           code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
   7928           code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
   7929          /* Got a "Special" instruction preamble.  Which one is it? */
   7930          if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
   7931             /* %EDX = client_request ( %EAX ) */
   7932             DIP("%%edx = client_request ( %%eax )\n");
   7933             delta += 14;
   7934             jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
   7935             dres.whatNext = Dis_StopHere;
   7936             goto decode_success;
   7937          }
   7938          else
   7939          if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
   7940             /* %EAX = guest_NRADDR */
   7941             DIP("%%eax = guest_NRADDR\n");
   7942             delta += 14;
   7943             putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
   7944             goto decode_success;
   7945          }
   7946          else
   7947          if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
   7948             /* call-noredir *%EAX */
   7949             DIP("call-noredir *%%eax\n");
   7950             delta += 14;
   7951             t1 = newTemp(Ity_I32);
   7952             assign(t1, getIReg(4,R_EAX));
   7953             t2 = newTemp(Ity_I32);
   7954             assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   7955             putIReg(4, R_ESP, mkexpr(t2));
   7956             storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
   7957             jmp_treg(Ijk_NoRedir,t1);
   7958             dres.whatNext = Dis_StopHere;
   7959             goto decode_success;
   7960          }
   7961          /* We don't know what it is. */
   7962          goto decode_failure;
   7963          /*NOTREACHED*/
   7964       }
   7965    }
   7966 
   7967    /* Handle a couple of weird-ass NOPs that have been observed in the
   7968       wild. */
   7969    {
   7970       UChar* code = (UChar*)(guest_code + delta);
   7971       /* Sun's JVM 1.5.0 uses the following as a NOP:
   7972          26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
   7973       if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
   7974           && code[3] == 0x65 && code[4] == 0x90) {
   7975          DIP("%%es:%%cs:%%fs:%%gs:nop\n");
   7976          delta += 5;
   7977          goto decode_success;
   7978       }
   7979       /* Don't barf on recent binutils padding,
   7980          all variants of which are: nopw %cs:0x0(%eax,%eax,1)
   7981          66 2e 0f 1f 84 00 00 00 00 00
   7982          66 66 2e 0f 1f 84 00 00 00 00 00
   7983          66 66 66 2e 0f 1f 84 00 00 00 00 00
   7984          66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7985          66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7986          66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
   7987       */
   7988       if (code[0] == 0x66) {
   7989          Int data16_cnt;
   7990          for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
   7991             if (code[data16_cnt] != 0x66)
   7992                break;
   7993          if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
   7994              && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
   7995              && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
   7996              && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
   7997              && code[data16_cnt + 8] == 0x00 ) {
   7998             DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
   7999             delta += 9 + data16_cnt;
   8000             goto decode_success;
   8001          }
   8002       }
   8003    }
   8004 
   8005    /* Normal instruction handling starts here. */
   8006 
   8007    /* Deal with some but not all prefixes:
   8008          66(oso)
   8009          F0(lock)
   8010          2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
   8011       Not dealt with (left in place):
   8012          F2 F3
   8013    */
   8014    n_prefixes = 0;
   8015    while (True) {
   8016       if (n_prefixes > 7) goto decode_failure;
   8017       pre = getUChar(delta);
   8018       switch (pre) {
   8019          case 0x66:
   8020             sz = 2;
   8021             break;
   8022          case 0xF0:
   8023             pfx_lock = True;
   8024             *expect_CAS = True;
   8025             break;
   8026          case 0x3E: /* %DS: */
   8027          case 0x26: /* %ES: */
   8028          case 0x64: /* %FS: */
   8029          case 0x65: /* %GS: */
   8030             if (sorb != 0)
   8031                goto decode_failure; /* only one seg override allowed */
   8032             sorb = pre;
   8033             break;
   8034          case 0x2E: { /* %CS: */
   8035             /* 2E prefix on a conditional branch instruction is a
   8036                branch-prediction hint, which can safely be ignored.  */
   8037             UChar op1 = getIByte(delta+1);
   8038             UChar op2 = getIByte(delta+2);
   8039             if ((op1 >= 0x70 && op1 <= 0x7F)
   8040                 || (op1 == 0xE3)
   8041                 || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
   8042                if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
   8043             } else {
   8044                /* All other CS override cases are not handled */
   8045                goto decode_failure;
   8046             }
   8047             break;
   8048          }
   8049          case 0x36: /* %SS: */
   8050             /* SS override cases are not handled */
   8051             goto decode_failure;
   8052          default:
   8053             goto not_a_prefix;
   8054       }
   8055       n_prefixes++;
   8056       delta++;
   8057    }
   8058 
   8059    not_a_prefix:
   8060 
   8061    /* Now we should be looking at the primary opcode byte or the
   8062       leading F2 or F3.  Check that any LOCK prefix is actually
   8063       allowed. */
   8064 
   8065    if (pfx_lock) {
   8066       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   8067          DIP("lock ");
   8068       } else {
   8069          *expect_CAS = False;
   8070          goto decode_failure;
   8071       }
   8072    }
   8073 
   8074 
   8075    /* ---------------------------------------------------- */
   8076    /* --- The SSE decoder.                             --- */
   8077    /* ---------------------------------------------------- */
   8078 
   8079    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   8080       previous life? */
   8081 
   8082    /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
   8083       later section, further on. */
   8084 
   8085    insn = (UChar*)&guest_code[delta];
   8086 
   8087    /* Treat fxsave specially.  It should be doable even on an SSE0
   8088       (Pentium-II class) CPU.  Hence be prepared to handle it on
   8089       any subarchitecture variant.
   8090    */
   8091 
   8092    /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   8093    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8094        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
   8095       IRDirty* d;
   8096       modrm = getIByte(delta+2);
   8097       vassert(sz == 4);
   8098       vassert(!epartIsReg(modrm));
   8099 
   8100       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8101       delta += 2+alen;
   8102 
   8103       DIP("fxsave %s\n", dis_buf);
   8104 
   8105       /* Uses dirty helper:
   8106             void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
   8107       d = unsafeIRDirty_0_N (
   8108              0/*regparms*/,
   8109              "x86g_dirtyhelper_FXSAVE",
   8110              &x86g_dirtyhelper_FXSAVE,
   8111              mkIRExprVec_1( mkexpr(addr) )
   8112           );
   8113       d->needsBBP = True;
   8114 
   8115       /* declare we're writing memory */
   8116       d->mFx   = Ifx_Write;
   8117       d->mAddr = mkexpr(addr);
   8118       d->mSize = 512;
   8119 
   8120       /* declare we're reading guest state */
   8121       d->nFxState = 7;
   8122 
   8123       d->fxState[0].fx     = Ifx_Read;
   8124       d->fxState[0].offset = OFFB_FTOP;
   8125       d->fxState[0].size   = sizeof(UInt);
   8126 
   8127       d->fxState[1].fx     = Ifx_Read;
   8128       d->fxState[1].offset = OFFB_FPREGS;
   8129       d->fxState[1].size   = 8 * sizeof(ULong);
   8130 
   8131       d->fxState[2].fx     = Ifx_Read;
   8132       d->fxState[2].offset = OFFB_FPTAGS;
   8133       d->fxState[2].size   = 8 * sizeof(UChar);
   8134 
   8135       d->fxState[3].fx     = Ifx_Read;
   8136       d->fxState[3].offset = OFFB_FPROUND;
   8137       d->fxState[3].size   = sizeof(UInt);
   8138 
   8139       d->fxState[4].fx     = Ifx_Read;
   8140       d->fxState[4].offset = OFFB_FC3210;
   8141       d->fxState[4].size   = sizeof(UInt);
   8142 
   8143       d->fxState[5].fx     = Ifx_Read;
   8144       d->fxState[5].offset = OFFB_XMM0;
   8145       d->fxState[5].size   = 8 * sizeof(U128);
   8146 
   8147       d->fxState[6].fx     = Ifx_Read;
   8148       d->fxState[6].offset = OFFB_SSEROUND;
   8149       d->fxState[6].size   = sizeof(UInt);
   8150 
   8151       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8152 	 images are packed back-to-back.  If not, the value of
   8153 	 d->fxState[5].size is wrong. */
   8154       vassert(16 == sizeof(U128));
   8155       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8156 
   8157       stmt( IRStmt_Dirty(d) );
   8158 
   8159       goto decode_success;
   8160    }
   8161 
   8162    /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   8163    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   8164        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
   8165       IRDirty* d;
   8166       modrm = getIByte(delta+2);
   8167       vassert(sz == 4);
   8168       vassert(!epartIsReg(modrm));
   8169 
   8170       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8171       delta += 2+alen;
   8172 
   8173       DIP("fxrstor %s\n", dis_buf);
   8174 
   8175       /* Uses dirty helper:
   8176             void x86g_do_FXRSTOR ( VexGuestX86State*, UInt ) */
   8177       d = unsafeIRDirty_0_N (
   8178              0/*regparms*/,
   8179              "x86g_dirtyhelper_FXRSTOR",
   8180              &x86g_dirtyhelper_FXRSTOR,
   8181              mkIRExprVec_1( mkexpr(addr) )
   8182           );
   8183       d->needsBBP = True;
   8184 
   8185       /* declare we're reading memory */
   8186       d->mFx   = Ifx_Read;
   8187       d->mAddr = mkexpr(addr);
   8188       d->mSize = 512;
   8189 
   8190       /* declare we're writing guest state */
   8191       d->nFxState = 7;
   8192 
   8193       d->fxState[0].fx     = Ifx_Write;
   8194       d->fxState[0].offset = OFFB_FTOP;
   8195       d->fxState[0].size   = sizeof(UInt);
   8196 
   8197       d->fxState[1].fx     = Ifx_Write;
   8198       d->fxState[1].offset = OFFB_FPREGS;
   8199       d->fxState[1].size   = 8 * sizeof(ULong);
   8200 
   8201       d->fxState[2].fx     = Ifx_Write;
   8202       d->fxState[2].offset = OFFB_FPTAGS;
   8203       d->fxState[2].size   = 8 * sizeof(UChar);
   8204 
   8205       d->fxState[3].fx     = Ifx_Write;
   8206       d->fxState[3].offset = OFFB_FPROUND;
   8207       d->fxState[3].size   = sizeof(UInt);
   8208 
   8209       d->fxState[4].fx     = Ifx_Write;
   8210       d->fxState[4].offset = OFFB_FC3210;
   8211       d->fxState[4].size   = sizeof(UInt);
   8212 
   8213       d->fxState[5].fx     = Ifx_Write;
   8214       d->fxState[5].offset = OFFB_XMM0;
   8215       d->fxState[5].size   = 8 * sizeof(U128);
   8216 
   8217       d->fxState[6].fx     = Ifx_Write;
   8218       d->fxState[6].offset = OFFB_SSEROUND;
   8219       d->fxState[6].size   = sizeof(UInt);
   8220 
   8221       /* Be paranoid ... this assertion tries to ensure the 8 %xmm
   8222 	 images are packed back-to-back.  If not, the value of
   8223 	 d->fxState[5].size is wrong. */
   8224       vassert(16 == sizeof(U128));
   8225       vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
   8226 
   8227       stmt( IRStmt_Dirty(d) );
   8228 
   8229       goto decode_success;
   8230    }
   8231 
   8232    /* ------ SSE decoder main ------ */
   8233 
   8234    /* Skip parts of the decoder which don't apply given the stated
   8235       guest subarchitecture. */
   8236    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
   8237       goto after_sse_decoders;
   8238 
   8239    /* Otherwise we must be doing sse1 or sse2, so we can at least try
   8240       for SSE1 here. */
   8241 
   8242    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
   8243    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
   8244       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
   8245       goto decode_success;
   8246    }
   8247 
   8248    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
   8249    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
   8250       vassert(sz == 4);
   8251       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
   8252       goto decode_success;
   8253    }
   8254 
   8255    /* 0F 55 = ANDNPS -- G = (not G) and E */
   8256    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
   8257       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
   8258       goto decode_success;
   8259    }
   8260 
   8261    /* 0F 54 = ANDPS -- G = G and E */
   8262    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
   8263       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
   8264       goto decode_success;
   8265    }
   8266 
   8267    /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   8268    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
   8269       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
   8270       goto decode_success;
   8271    }
   8272 
   8273    /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   8274    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
   8275       vassert(sz == 4);
   8276       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
   8277       goto decode_success;
   8278    }
   8279 
   8280    /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
   8281    /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
   8282    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   8283       IRTemp argL = newTemp(Ity_F32);
   8284       IRTemp argR = newTemp(Ity_F32);
   8285       modrm = getIByte(delta+2);
   8286       if (epartIsReg(modrm)) {
   8287          assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   8288          delta += 2+1;
   8289          DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8290                                   nameXMMReg(gregOfRM(modrm)) );
   8291       } else {
   8292          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8293 	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   8294          delta += 2+alen;
   8295          DIP("[u]comiss %s,%s\n", dis_buf,
   8296                                   nameXMMReg(gregOfRM(modrm)) );
   8297       }
   8298       assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   8299 
   8300       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   8301       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   8302       stmt( IRStmt_Put(
   8303                OFFB_CC_DEP1,
   8304                binop( Iop_And32,
   8305                       binop(Iop_CmpF64,
   8306                             unop(Iop_F32toF64,mkexpr(argL)),
   8307                             unop(Iop_F32toF64,mkexpr(argR))),
   8308                       mkU32(0x45)
   8309           )));
   8310       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8311          elimination of previous stores to this field work better. */
   8312       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   8313       goto decode_success;
   8314    }
   8315 
   8316    /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   8317       half xmm */
   8318    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
   8319       IRTemp arg64 = newTemp(Ity_I64);
   8320       IRTemp rmode = newTemp(Ity_I32);
   8321       vassert(sz == 4);
   8322 
   8323       modrm = getIByte(delta+2);
   8324       do_MMX_preamble();
   8325       if (epartIsReg(modrm)) {
   8326          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   8327          delta += 2+1;
   8328          DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8329                                  nameXMMReg(gregOfRM(modrm)));
   8330       } else {
   8331          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8332 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   8333          delta += 2+alen;
   8334          DIP("cvtpi2ps %s,%s\n", dis_buf,
   8335                                  nameXMMReg(gregOfRM(modrm)) );
   8336       }
   8337 
   8338       assign( rmode, get_sse_roundingmode() );
   8339 
   8340       putXMMRegLane32F(
   8341          gregOfRM(modrm), 0,
   8342          binop(Iop_F64toF32,
   8343                mkexpr(rmode),
   8344                unop(Iop_I32StoF64,
   8345                     unop(Iop_64to32, mkexpr(arg64)) )) );
   8346 
   8347       putXMMRegLane32F(
   8348          gregOfRM(modrm), 1,
   8349          binop(Iop_F64toF32,
   8350                mkexpr(rmode),
   8351                unop(Iop_I32StoF64,
   8352                     unop(Iop_64HIto32, mkexpr(arg64)) )) );
   8353 
   8354       goto decode_success;
   8355    }
   8356 
   8357    /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
   8358       quarter xmm */
   8359    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
   8360       IRTemp arg32 = newTemp(Ity_I32);
   8361       IRTemp rmode = newTemp(Ity_I32);
   8362       vassert(sz == 4);
   8363 
   8364       modrm = getIByte(delta+3);
   8365       if (epartIsReg(modrm)) {
   8366          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   8367          delta += 3+1;
   8368          DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   8369                                  nameXMMReg(gregOfRM(modrm)));
   8370       } else {
   8371          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8372 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   8373          delta += 3+alen;
   8374          DIP("cvtsi2ss %s,%s\n", dis_buf,
   8375                                  nameXMMReg(gregOfRM(modrm)) );
   8376       }
   8377 
   8378       assign( rmode, get_sse_roundingmode() );
   8379 
   8380       putXMMRegLane32F(
   8381          gregOfRM(modrm), 0,
   8382          binop(Iop_F64toF32,
   8383                mkexpr(rmode),
   8384                unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   8385 
   8386       goto decode_success;
   8387    }
   8388 
   8389    /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8390       I32 in mmx, according to prevailing SSE rounding mode */
   8391    /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   8392       I32 in mmx, rounding towards zero */
   8393    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   8394       IRTemp dst64  = newTemp(Ity_I64);
   8395       IRTemp rmode  = newTemp(Ity_I32);
   8396       IRTemp f32lo  = newTemp(Ity_F32);
   8397       IRTemp f32hi  = newTemp(Ity_F32);
   8398       Bool   r2zero = toBool(insn[1] == 0x2C);
   8399 
   8400       do_MMX_preamble();
   8401       modrm = getIByte(delta+2);
   8402 
   8403       if (epartIsReg(modrm)) {
   8404          delta += 2+1;
   8405 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8406 	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
   8407          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8408                                    nameXMMReg(eregOfRM(modrm)),
   8409                                    nameMMXReg(gregOfRM(modrm)));
   8410       } else {
   8411          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8412 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8413 	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
   8414                                               mkexpr(addr),
   8415                                               mkU32(4) )));
   8416          delta += 2+alen;
   8417          DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   8418                                    dis_buf,
   8419                                    nameMMXReg(gregOfRM(modrm)));
   8420       }
   8421 
   8422       if (r2zero) {
   8423          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   8424       } else {
   8425          assign( rmode, get_sse_roundingmode() );
   8426       }
   8427 
   8428       assign(
   8429          dst64,
   8430          binop( Iop_32HLto64,
   8431                 binop( Iop_F64toI32S,
   8432                        mkexpr(rmode),
   8433                        unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   8434                 binop( Iop_F64toI32S,
   8435                        mkexpr(rmode),
   8436                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8437               )
   8438       );
   8439 
   8440       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   8441       goto decode_success;
   8442    }
   8443 
   8444    /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
   8445       I32 in ireg, according to prevailing SSE rounding mode */
   8446    /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
   8447       I32 in ireg, rounding towards zero */
   8448    if (insn[0] == 0xF3 && insn[1] == 0x0F
   8449        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   8450       IRTemp rmode = newTemp(Ity_I32);
   8451       IRTemp f32lo = newTemp(Ity_F32);
   8452       Bool   r2zero = toBool(insn[2] == 0x2C);
   8453       vassert(sz == 4);
   8454 
   8455       modrm = getIByte(delta+3);
   8456       if (epartIsReg(modrm)) {
   8457          delta += 3+1;
   8458 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   8459          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8460                                    nameXMMReg(eregOfRM(modrm)),
   8461                                    nameIReg(4, gregOfRM(modrm)));
   8462       } else {
   8463          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8464 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   8465          delta += 3+alen;
   8466          DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
   8467                                    dis_buf,
   8468                                    nameIReg(4, gregOfRM(modrm)));
   8469       }
   8470 
   8471       if (r2zero) {
   8472          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   8473       } else {
   8474          assign( rmode, get_sse_roundingmode() );
   8475       }
   8476 
   8477       putIReg(4, gregOfRM(modrm),
   8478                  binop( Iop_F64toI32S,
   8479                         mkexpr(rmode),
   8480                         unop( Iop_F32toF64, mkexpr(f32lo) ) )
   8481       );
   8482 
   8483       goto decode_success;
   8484    }
   8485 
   8486    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   8487    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
   8488       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
   8489       goto decode_success;
   8490    }
   8491 
   8492    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   8493    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
   8494       vassert(sz == 4);
   8495       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
   8496       goto decode_success;
   8497    }
   8498 
   8499    /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   8500    if (insn[0] == 0x0F && insn[1] == 0xAE
   8501        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
   8502 
   8503       IRTemp t64 = newTemp(Ity_I64);
   8504       IRTemp ew = newTemp(Ity_I32);
   8505 
   8506       modrm = getIByte(delta+2);
   8507       vassert(!epartIsReg(modrm));
   8508       vassert(sz == 4);
   8509 
   8510       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8511       delta += 2+alen;
   8512       DIP("ldmxcsr %s\n", dis_buf);
   8513 
   8514       /* The only thing we observe in %mxcsr is the rounding mode.
   8515          Therefore, pass the 32-bit value (SSE native-format control
   8516          word) to a clean helper, getting back a 64-bit value, the
   8517          lower half of which is the SSEROUND value to store, and the
   8518          upper half of which is the emulation-warning token which may
   8519          be generated.
   8520       */
   8521       /* ULong x86h_check_ldmxcsr ( UInt ); */
   8522       assign( t64, mkIRExprCCall(
   8523                       Ity_I64, 0/*regparms*/,
   8524                       "x86g_check_ldmxcsr",
   8525                       &x86g_check_ldmxcsr,
   8526                       mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
   8527                    )
   8528             );
   8529 
   8530       put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   8531       assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   8532       put_emwarn( mkexpr(ew) );
   8533       /* Finally, if an emulation warning was reported, side-exit to
   8534          the next insn, reporting the warning, so that Valgrind's
   8535          dispatcher sees the warning. */
   8536       stmt(
   8537          IRStmt_Exit(
   8538             binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   8539             Ijk_EmWarn,
   8540             IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
   8541          )
   8542       );
   8543       goto decode_success;
   8544    }
   8545 
   8546    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8547    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   8548    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
   8549       Bool ok = False;
   8550       delta = dis_MMX( &ok, sorb, sz, delta+1 );
   8551       if (!ok)
   8552          goto decode_failure;
   8553       goto decode_success;
   8554    }
   8555 
   8556    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   8557    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
   8558       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
   8559       goto decode_success;
   8560    }
   8561 
   8562    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   8563    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
   8564       vassert(sz == 4);
   8565       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
   8566       goto decode_success;
   8567    }
   8568 
   8569    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   8570    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
   8571       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
   8572       goto decode_success;
   8573    }
   8574 
   8575    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   8576    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
   8577       vassert(sz == 4);
   8578       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
   8579       goto decode_success;
   8580    }
   8581 
   8582    /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   8583    /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   8584    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
   8585       modrm = getIByte(delta+2);
   8586       if (epartIsReg(modrm)) {
   8587          putXMMReg( gregOfRM(modrm),
   8588                     getXMMReg( eregOfRM(modrm) ));
   8589          DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8590                                   nameXMMReg(gregOfRM(modrm)));
   8591          delta += 2+1;
   8592       } else {
   8593          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8594          if (insn[1] == 0x28/*movaps*/)
   8595             gen_SEGV_if_not_16_aligned( addr );
   8596          putXMMReg( gregOfRM(modrm),
   8597                     loadLE(Ity_V128, mkexpr(addr)) );
   8598          DIP("mov[ua]ps %s,%s\n", dis_buf,
   8599                                   nameXMMReg(gregOfRM(modrm)));
   8600          delta += 2+alen;
   8601       }
   8602       goto decode_success;
   8603    }
   8604 
   8605    /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   8606    /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   8607    if (sz == 4 && insn[0] == 0x0F
   8608        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   8609       modrm = getIByte(delta+2);
   8610       if (epartIsReg(modrm)) {
   8611          /* fall through; awaiting test case */
   8612       } else {
   8613          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8614          if (insn[1] == 0x29/*movaps*/)
   8615             gen_SEGV_if_not_16_aligned( addr );
   8616          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8617          DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8618                                   dis_buf );
   8619          delta += 2+alen;
   8620          goto decode_success;
   8621       }
   8622    }
   8623 
   8624    /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   8625    /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   8626    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
   8627       modrm = getIByte(delta+2);
   8628       if (epartIsReg(modrm)) {
   8629          delta += 2+1;
   8630          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8631                           getXMMRegLane64( eregOfRM(modrm), 0 ) );
   8632          DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8633                                nameXMMReg(gregOfRM(modrm)));
   8634       } else {
   8635          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8636          delta += 2+alen;
   8637          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   8638                           loadLE(Ity_I64, mkexpr(addr)) );
   8639          DIP("movhps %s,%s\n", dis_buf,
   8640                                nameXMMReg( gregOfRM(modrm) ));
   8641       }
   8642       goto decode_success;
   8643    }
   8644 
   8645    /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   8646    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
   8647       if (!epartIsReg(insn[2])) {
   8648          delta += 2;
   8649          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8650          delta += alen;
   8651          storeLE( mkexpr(addr),
   8652                   getXMMRegLane64( gregOfRM(insn[2]),
   8653                                    1/*upper lane*/ ) );
   8654          DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8655                                dis_buf);
   8656          goto decode_success;
   8657       }
   8658       /* else fall through */
   8659    }
   8660 
   8661    /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   8662    /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   8663    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
   8664       modrm = getIByte(delta+2);
   8665       if (epartIsReg(modrm)) {
   8666          delta += 2+1;
   8667          putXMMRegLane64( gregOfRM(modrm),
   8668                           0/*lower lane*/,
   8669                           getXMMRegLane64( eregOfRM(modrm), 1 ));
   8670          DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
   8671                                  nameXMMReg(gregOfRM(modrm)));
   8672       } else {
   8673          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8674          delta += 2+alen;
   8675          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   8676                           loadLE(Ity_I64, mkexpr(addr)) );
   8677          DIP("movlps %s, %s\n",
   8678              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   8679       }
   8680       goto decode_success;
   8681    }
   8682 
   8683    /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   8684    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
   8685       if (!epartIsReg(insn[2])) {
   8686          delta += 2;
   8687          addr = disAMode ( &alen, sorb, delta, dis_buf );
   8688          delta += alen;
   8689          storeLE( mkexpr(addr),
   8690                   getXMMRegLane64( gregOfRM(insn[2]),
   8691                                    0/*lower lane*/ ) );
   8692          DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   8693                                 dis_buf);
   8694          goto decode_success;
   8695       }
   8696       /* else fall through */
   8697    }
   8698 
   8699    /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
   8700       to 4 lowest bits of ireg(G) */
   8701    if (insn[0] == 0x0F && insn[1] == 0x50) {
   8702       modrm = getIByte(delta+2);
   8703       if (sz == 4 && epartIsReg(modrm)) {
   8704          Int src;
   8705          t0 = newTemp(Ity_I32);
   8706          t1 = newTemp(Ity_I32);
   8707          t2 = newTemp(Ity_I32);
   8708          t3 = newTemp(Ity_I32);
   8709          delta += 2+1;
   8710          src = eregOfRM(modrm);
   8711          assign( t0, binop( Iop_And32,
   8712                             binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
   8713                             mkU32(1) ));
   8714          assign( t1, binop( Iop_And32,
   8715                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
   8716                             mkU32(2) ));
   8717          assign( t2, binop( Iop_And32,
   8718                             binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
   8719                             mkU32(4) ));
   8720          assign( t3, binop( Iop_And32,
   8721                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
   8722                             mkU32(8) ));
   8723          putIReg(4, gregOfRM(modrm),
   8724                     binop(Iop_Or32,
   8725                           binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   8726                           binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
   8727                          )
   8728                  );
   8729          DIP("movmskps %s,%s\n", nameXMMReg(src),
   8730                                  nameIReg(4, gregOfRM(modrm)));
   8731          goto decode_success;
   8732       }
   8733       /* else fall through */
   8734    }
   8735 
   8736    /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   8737    /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   8738    if (insn[0] == 0x0F && insn[1] == 0x2B) {
   8739       modrm = getIByte(delta+2);
   8740       if (!epartIsReg(modrm)) {
   8741          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8742          gen_SEGV_if_not_16_aligned( addr );
   8743          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   8744          DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   8745                                  dis_buf,
   8746                                  nameXMMReg(gregOfRM(modrm)));
   8747          delta += 2+alen;
   8748          goto decode_success;
   8749       }
   8750       /* else fall through */
   8751    }
   8752 
   8753    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8754    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   8755       Intel manual does not say anything about the usual business of
   8756       the FP reg tags getting trashed whenever an MMX insn happens.
   8757       So we just leave them alone.
   8758    */
   8759    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   8760       modrm = getIByte(delta+2);
   8761       if (sz == 4 && !epartIsReg(modrm)) {
   8762          /* do_MMX_preamble(); Intel docs don't specify this */
   8763          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8764          storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
   8765          DIP("movntq %s,%s\n", dis_buf,
   8766                                nameMMXReg(gregOfRM(modrm)));
   8767          delta += 2+alen;
   8768          goto decode_success;
   8769       }
   8770       /* else fall through */
   8771    }
   8772 
   8773    /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   8774       (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   8775    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
   8776       vassert(sz == 4);
   8777       modrm = getIByte(delta+3);
   8778       if (epartIsReg(modrm)) {
   8779          putXMMRegLane32( gregOfRM(modrm), 0,
   8780                           getXMMRegLane32( eregOfRM(modrm), 0 ));
   8781          DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   8782                               nameXMMReg(gregOfRM(modrm)));
   8783          delta += 3+1;
   8784       } else {
   8785          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8786          /* zero bits 127:64 */
   8787          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   8788          /* zero bits 63:32 */
   8789          putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
   8790          /* write bits 31:0 */
   8791          putXMMRegLane32( gregOfRM(modrm), 0,
   8792                           loadLE(Ity_I32, mkexpr(addr)) );
   8793          DIP("movss %s,%s\n", dis_buf,
   8794                               nameXMMReg(gregOfRM(modrm)));
   8795          delta += 3+alen;
   8796       }
   8797       goto decode_success;
   8798    }
   8799 
   8800    /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   8801       or lo 1/4 xmm). */
   8802    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
   8803       vassert(sz == 4);
   8804       modrm = getIByte(delta+3);
   8805       if (epartIsReg(modrm)) {
   8806          /* fall through, we don't yet have a test case */
   8807       } else {
   8808          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   8809          storeLE( mkexpr(addr),
   8810                   getXMMRegLane32(gregOfRM(modrm), 0) );
   8811          DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   8812                               dis_buf);
   8813          delta += 3+alen;
   8814          goto decode_success;
   8815       }
   8816    }
   8817 
   8818    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   8819    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
   8820       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
   8821       goto decode_success;
   8822    }
   8823 
   8824    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   8825    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
   8826       vassert(sz == 4);
   8827       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
   8828       goto decode_success;
   8829    }
   8830 
   8831    /* 0F 56 = ORPS -- G = G or E */
   8832    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
   8833       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
   8834       goto decode_success;
   8835    }
   8836 
   8837    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8838    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   8839    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
   8840       do_MMX_preamble();
   8841       delta = dis_MMXop_regmem_to_reg (
   8842                 sorb, delta+2, insn[1], "pavgb", False );
   8843       goto decode_success;
   8844    }
   8845 
   8846    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8847    /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   8848    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
   8849       do_MMX_preamble();
   8850       delta = dis_MMXop_regmem_to_reg (
   8851                 sorb, delta+2, insn[1], "pavgw", False );
   8852       goto decode_success;
   8853    }
   8854 
   8855    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8856    /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   8857       zero-extend of it in ireg(G). */
   8858    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   8859       modrm = insn[2];
   8860       if (sz == 4 && epartIsReg(modrm)) {
   8861          IRTemp sV = newTemp(Ity_I64);
   8862          t5 = newTemp(Ity_I16);
   8863          do_MMX_preamble();
   8864          assign(sV, getMMXReg(eregOfRM(modrm)));
   8865          breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   8866          switch (insn[3] & 3) {
   8867             case 0:  assign(t5, mkexpr(t0)); break;
   8868             case 1:  assign(t5, mkexpr(t1)); break;
   8869             case 2:  assign(t5, mkexpr(t2)); break;
   8870             case 3:  assign(t5, mkexpr(t3)); break;
   8871             default: vassert(0); /*NOTREACHED*/
   8872          }
   8873          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
   8874          DIP("pextrw $%d,%s,%s\n",
   8875              (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
   8876                            nameIReg(4,gregOfRM(modrm)));
   8877          delta += 4;
   8878          goto decode_success;
   8879       }
   8880       /* else fall through */
   8881    }
   8882 
   8883    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8884    /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   8885       put it into the specified lane of mmx(G). */
   8886    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
   8887       /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   8888          mmx reg.  t4 is the new lane value.  t5 is the original
   8889          mmx value. t6 is the new mmx value. */
   8890       Int lane;
   8891       t4 = newTemp(Ity_I16);
   8892       t5 = newTemp(Ity_I64);
   8893       t6 = newTemp(Ity_I64);
   8894       modrm = insn[2];
   8895       do_MMX_preamble();
   8896 
   8897       assign(t5, getMMXReg(gregOfRM(modrm)));
   8898       breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   8899 
   8900       if (epartIsReg(modrm)) {
   8901          assign(t4, getIReg(2, eregOfRM(modrm)));
   8902          delta += 3+1;
   8903          lane = insn[3+1-1];
   8904          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8905                                    nameIReg(2,eregOfRM(modrm)),
   8906                                    nameMMXReg(gregOfRM(modrm)));
   8907       } else {
   8908          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   8909          delta += 3+alen;
   8910          lane = insn[3+alen-1];
   8911          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   8912          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   8913                                    dis_buf,
   8914                                    nameMMXReg(gregOfRM(modrm)));
   8915       }
   8916 
   8917       switch (lane & 3) {
   8918          case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   8919          case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   8920          case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   8921          case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   8922          default: vassert(0); /*NOTREACHED*/
   8923       }
   8924       putMMXReg(gregOfRM(modrm), mkexpr(t6));
   8925       goto decode_success;
   8926    }
   8927 
   8928    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8929    /* 0F EE = PMAXSW -- 16x4 signed max */
   8930    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
   8931       do_MMX_preamble();
   8932       delta = dis_MMXop_regmem_to_reg (
   8933                 sorb, delta+2, insn[1], "pmaxsw", False );
   8934       goto decode_success;
   8935    }
   8936 
   8937    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8938    /* 0F DE = PMAXUB -- 8x8 unsigned max */
   8939    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
   8940       do_MMX_preamble();
   8941       delta = dis_MMXop_regmem_to_reg (
   8942                 sorb, delta+2, insn[1], "pmaxub", False );
   8943       goto decode_success;
   8944    }
   8945 
   8946    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8947    /* 0F EA = PMINSW -- 16x4 signed min */
   8948    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
   8949       do_MMX_preamble();
   8950       delta = dis_MMXop_regmem_to_reg (
   8951                 sorb, delta+2, insn[1], "pminsw", False );
   8952       goto decode_success;
   8953    }
   8954 
   8955    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8956    /* 0F DA = PMINUB -- 8x8 unsigned min */
   8957    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
   8958       do_MMX_preamble();
   8959       delta = dis_MMXop_regmem_to_reg (
   8960                 sorb, delta+2, insn[1], "pminub", False );
   8961       goto decode_success;
   8962    }
   8963 
   8964    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8965    /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   8966       mmx(E), turn them into a byte, and put zero-extend of it in
   8967       ireg(G). */
   8968    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
   8969       modrm = insn[2];
   8970       if (epartIsReg(modrm)) {
   8971          do_MMX_preamble();
   8972          t0 = newTemp(Ity_I64);
   8973          t1 = newTemp(Ity_I32);
   8974          assign(t0, getMMXReg(eregOfRM(modrm)));
   8975          assign(t1, mkIRExprCCall(
   8976                        Ity_I32, 0/*regparms*/,
   8977                        "x86g_calculate_mmx_pmovmskb",
   8978                        &x86g_calculate_mmx_pmovmskb,
   8979                        mkIRExprVec_1(mkexpr(t0))));
   8980          putIReg(4, gregOfRM(modrm), mkexpr(t1));
   8981          DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   8982                                  nameIReg(4,gregOfRM(modrm)));
   8983          delta += 3;
   8984          goto decode_success;
   8985       }
   8986       /* else fall through */
   8987    }
   8988 
   8989    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   8990    /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   8991    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
   8992       do_MMX_preamble();
   8993       delta = dis_MMXop_regmem_to_reg (
   8994                 sorb, delta+2, insn[1], "pmuluh", False );
   8995       goto decode_success;
   8996    }
   8997 
   8998    /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   8999    /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
   9000    /* 0F 18 /2 = PREFETCHT1 */
   9001    /* 0F 18 /3 = PREFETCHT2 */
   9002    if (insn[0] == 0x0F && insn[1] == 0x18
   9003        && !epartIsReg(insn[2])
   9004        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
   9005       HChar* hintstr = "??";
   9006 
   9007       modrm = getIByte(delta+2);
   9008       vassert(!epartIsReg(modrm));
   9009 
   9010       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9011       delta += 2+alen;
   9012 
   9013       switch (gregOfRM(modrm)) {
   9014          case 0: hintstr = "nta"; break;
   9015          case 1: hintstr = "t0"; break;
   9016          case 2: hintstr = "t1"; break;
   9017          case 3: hintstr = "t2"; break;
   9018          default: vassert(0); /*NOTREACHED*/
   9019       }
   9020 
   9021       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9022       goto decode_success;
   9023    }
   9024 
   9025    /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
   9026    /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
   9027    if (insn[0] == 0x0F && insn[1] == 0x0D
   9028        && !epartIsReg(insn[2])
   9029        && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
   9030       HChar* hintstr = "??";
   9031 
   9032       modrm = getIByte(delta+2);
   9033       vassert(!epartIsReg(modrm));
   9034 
   9035       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9036       delta += 2+alen;
   9037 
   9038       switch (gregOfRM(modrm)) {
   9039          case 0: hintstr = ""; break;
   9040          case 1: hintstr = "w"; break;
   9041          default: vassert(0); /*NOTREACHED*/
   9042       }
   9043 
   9044       DIP("prefetch%s %s\n", hintstr, dis_buf);
   9045       goto decode_success;
   9046    }
   9047 
   9048    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9049    /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   9050    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
   9051       do_MMX_preamble();
   9052       delta = dis_MMXop_regmem_to_reg (
   9053                  sorb, delta+2, insn[1], "psadbw", False );
   9054       goto decode_success;
   9055    }
   9056 
   9057    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   9058    /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   9059    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
   9060       Int order;
   9061       IRTemp sV, dV, s3, s2, s1, s0;
   9062       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   9063       sV = newTemp(Ity_I64);
   9064       dV = newTemp(Ity_I64);
   9065       do_MMX_preamble();
   9066       modrm = insn[2];
   9067       if (epartIsReg(modrm)) {
   9068          assign( sV, getMMXReg(eregOfRM(modrm)) );
   9069          order = (Int)insn[3];
   9070          delta += 2+2;
   9071          DIP("pshufw $%d,%s,%s\n", order,
   9072                                    nameMMXReg(eregOfRM(modrm)),
   9073                                    nameMMXReg(gregOfRM(modrm)));
   9074       } else {
   9075          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9076          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   9077 	 order = (Int)insn[2+alen];
   9078          delta += 3+alen;
   9079          DIP("pshufw $%d,%s,%s\n", order,
   9080                                    dis_buf,
   9081                                    nameMMXReg(gregOfRM(modrm)));
   9082       }
   9083       breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   9084 
   9085 #     define SEL(n) \
   9086                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9087       assign(dV,
   9088 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   9089                           SEL((order>>2)&3), SEL((order>>0)&3) )
   9090       );
   9091       putMMXReg(gregOfRM(modrm), mkexpr(dV));
   9092 #     undef SEL
   9093       goto decode_success;
   9094    }
   9095 
   9096    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   9097    if (insn[0] == 0x0F && insn[1] == 0x53) {
   9098       vassert(sz == 4);
   9099       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9100                                         "rcpps", Iop_Recip32Fx4 );
   9101       goto decode_success;
   9102    }
   9103 
   9104    /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   9105    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
   9106       vassert(sz == 4);
   9107       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9108                                          "rcpss", Iop_Recip32F0x4 );
   9109       goto decode_success;
   9110    }
   9111 
   9112    /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   9113    if (insn[0] == 0x0F && insn[1] == 0x52) {
   9114       vassert(sz == 4);
   9115       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9116                                         "rsqrtps", Iop_RSqrt32Fx4 );
   9117       goto decode_success;
   9118    }
   9119 
   9120    /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   9121    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
   9122       vassert(sz == 4);
   9123       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9124                                          "rsqrtss", Iop_RSqrt32F0x4 );
   9125       goto decode_success;
   9126    }
   9127 
   9128    /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   9129    if (insn[0] == 0x0F && insn[1] == 0xAE
   9130        && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   9131       vassert(sz == 4);
   9132       delta += 3;
   9133       /* Insert a memory fence.  It's sometimes important that these
   9134          are carried through to the generated code. */
   9135       stmt( IRStmt_MBE(Imbe_Fence) );
   9136       DIP("sfence\n");
   9137       goto decode_success;
   9138    }
   9139 
   9140    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   9141    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
   9142       Int    select;
   9143       IRTemp sV, dV;
   9144       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9145       sV = newTemp(Ity_V128);
   9146       dV = newTemp(Ity_V128);
   9147       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9148       modrm = insn[2];
   9149       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9150 
   9151       if (epartIsReg(modrm)) {
   9152          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9153          select = (Int)insn[3];
   9154          delta += 2+2;
   9155          DIP("shufps $%d,%s,%s\n", select,
   9156                                    nameXMMReg(eregOfRM(modrm)),
   9157                                    nameXMMReg(gregOfRM(modrm)));
   9158       } else {
   9159          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9160          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9161          select = (Int)insn[2+alen];
   9162          delta += 3+alen;
   9163          DIP("shufps $%d,%s,%s\n", select,
   9164                                    dis_buf,
   9165                                    nameXMMReg(gregOfRM(modrm)));
   9166       }
   9167 
   9168       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9169       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9170 
   9171 #     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   9172 #     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   9173 
   9174       putXMMReg(
   9175          gregOfRM(modrm),
   9176          mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
   9177                        SELD((select>>2)&3), SELD((select>>0)&3) )
   9178       );
   9179 
   9180 #     undef SELD
   9181 #     undef SELS
   9182 
   9183       goto decode_success;
   9184    }
   9185 
   9186    /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
   9187    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
   9188       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   9189                                         "sqrtps", Iop_Sqrt32Fx4 );
   9190       goto decode_success;
   9191    }
   9192 
   9193    /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
   9194    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
   9195       vassert(sz == 4);
   9196       delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
   9197                                          "sqrtss", Iop_Sqrt32F0x4 );
   9198       goto decode_success;
   9199    }
   9200 
   9201    /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   9202    if (insn[0] == 0x0F && insn[1] == 0xAE
   9203        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
   9204       modrm = getIByte(delta+2);
   9205       vassert(sz == 4);
   9206       vassert(!epartIsReg(modrm));
   9207 
   9208       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9209       delta += 2+alen;
   9210 
   9211       /* Fake up a native SSE mxcsr word.  The only thing it depends
   9212          on is SSEROUND[1:0], so call a clean helper to cook it up.
   9213       */
   9214       /* UInt x86g_create_mxcsr ( UInt sseround ) */
   9215       DIP("stmxcsr %s\n", dis_buf);
   9216       storeLE( mkexpr(addr),
   9217                mkIRExprCCall(
   9218                   Ity_I32, 0/*regp*/,
   9219                   "x86g_create_mxcsr", &x86g_create_mxcsr,
   9220                   mkIRExprVec_1( get_sse_roundingmode() )
   9221                )
   9222              );
   9223       goto decode_success;
   9224    }
   9225 
   9226    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   9227    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
   9228       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
   9229       goto decode_success;
   9230    }
   9231 
   9232    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   9233    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
   9234       vassert(sz == 4);
   9235       delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
   9236       goto decode_success;
   9237    }
   9238 
   9239    /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   9240    /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   9241    /* These just appear to be special cases of SHUFPS */
   9242    if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   9243       IRTemp sV, dV;
   9244       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   9245       Bool hi = toBool(insn[1] == 0x15);
   9246       sV = newTemp(Ity_V128);
   9247       dV = newTemp(Ity_V128);
   9248       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   9249       modrm = insn[2];
   9250       assign( dV, getXMMReg(gregOfRM(modrm)) );
   9251 
   9252       if (epartIsReg(modrm)) {
   9253          assign( sV, getXMMReg(eregOfRM(modrm)) );
   9254          delta += 2+1;
   9255          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9256                                   nameXMMReg(eregOfRM(modrm)),
   9257                                   nameXMMReg(gregOfRM(modrm)));
   9258       } else {
   9259          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9260          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   9261          delta += 2+alen;
   9262          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   9263                                   dis_buf,
   9264                                   nameXMMReg(gregOfRM(modrm)));
   9265       }
   9266 
   9267       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   9268       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   9269 
   9270       if (hi) {
   9271          putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
   9272       } else {
   9273          putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
   9274       }
   9275 
   9276       goto decode_success;
   9277    }
   9278 
   9279    /* 0F 57 = XORPS -- G = G xor E */
   9280    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
   9281       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
   9282       goto decode_success;
   9283    }
   9284 
   9285    /* ---------------------------------------------------- */
   9286    /* --- end of the SSE decoder.                      --- */
   9287    /* ---------------------------------------------------- */
   9288 
   9289    /* ---------------------------------------------------- */
   9290    /* --- start of the SSE2 decoder.                   --- */
   9291    /* ---------------------------------------------------- */
   9292 
   9293    /* Skip parts of the decoder which don't apply given the stated
   9294       guest subarchitecture. */
   9295    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   9296       goto after_sse_decoders; /* no SSE2 capabilities */
   9297 
   9298    insn = (UChar*)&guest_code[delta];
   9299 
   9300    /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   9301    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
   9302       delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
   9303       goto decode_success;
   9304    }
   9305 
   9306    /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
   9307    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
   9308       vassert(sz == 4);
   9309       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
   9310       goto decode_success;
   9311    }
   9312 
   9313    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
   9314    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
   9315       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
   9316       goto decode_success;
   9317    }
   9318 
   9319    /* 66 0F 54 = ANDPD -- G = G and E */
   9320    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
   9321       delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
   9322       goto decode_success;
   9323    }
   9324 
   9325    /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   9326    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
   9327       delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
   9328       goto decode_success;
   9329    }
   9330 
   9331    /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   9332    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
   9333       vassert(sz == 4);
   9334       delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
   9335       goto decode_success;
   9336    }
   9337 
   9338    /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   9339    /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   9340    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
   9341       IRTemp argL = newTemp(Ity_F64);
   9342       IRTemp argR = newTemp(Ity_F64);
   9343       modrm = getIByte(delta+2);
   9344       if (epartIsReg(modrm)) {
   9345          assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
   9346          delta += 2+1;
   9347          DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9348                                   nameXMMReg(gregOfRM(modrm)) );
   9349       } else {
   9350          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9351 	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   9352          delta += 2+alen;
   9353          DIP("[u]comisd %s,%s\n", dis_buf,
   9354                                   nameXMMReg(gregOfRM(modrm)) );
   9355       }
   9356       assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
   9357 
   9358       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   9359       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   9360       stmt( IRStmt_Put(
   9361                OFFB_CC_DEP1,
   9362                binop( Iop_And32,
   9363                       binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
   9364                       mkU32(0x45)
   9365           )));
   9366       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   9367          elimination of previous stores to this field work better. */
   9368       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   9369       goto decode_success;
   9370    }
   9371 
   9372    /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   9373       F64 in xmm(G) */
   9374    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9375       IRTemp arg64 = newTemp(Ity_I64);
   9376       vassert(sz == 4);
   9377 
   9378       modrm = getIByte(delta+3);
   9379       if (epartIsReg(modrm)) {
   9380          assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
   9381          delta += 3+1;
   9382          DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9383                                  nameXMMReg(gregOfRM(modrm)));
   9384       } else {
   9385          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9386 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9387          delta += 3+alen;
   9388          DIP("cvtdq2pd %s,%s\n", dis_buf,
   9389                                  nameXMMReg(gregOfRM(modrm)) );
   9390       }
   9391 
   9392       putXMMRegLane64F(
   9393          gregOfRM(modrm), 0,
   9394          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   9395       );
   9396 
   9397       putXMMRegLane64F(
   9398          gregOfRM(modrm), 1,
   9399          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   9400       );
   9401 
   9402       goto decode_success;
   9403    }
   9404 
   9405    /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   9406       xmm(G) */
   9407    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9408       IRTemp argV  = newTemp(Ity_V128);
   9409       IRTemp rmode = newTemp(Ity_I32);
   9410 
   9411       modrm = getIByte(delta+2);
   9412       if (epartIsReg(modrm)) {
   9413          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9414          delta += 2+1;
   9415          DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9416                                  nameXMMReg(gregOfRM(modrm)));
   9417       } else {
   9418          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9419 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9420          delta += 2+alen;
   9421          DIP("cvtdq2ps %s,%s\n", dis_buf,
   9422                                  nameXMMReg(gregOfRM(modrm)) );
   9423       }
   9424 
   9425       assign( rmode, get_sse_roundingmode() );
   9426       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9427 
   9428 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9429                              mkexpr(rmode),                   \
   9430                              unop(Iop_I32StoF64,mkexpr(_t)))
   9431 
   9432       putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
   9433       putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
   9434       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9435       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9436 
   9437 #     undef CVT
   9438 
   9439       goto decode_success;
   9440    }
   9441 
   9442    /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9443       lo half xmm(G), and zero upper half */
   9444    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
   9445       IRTemp argV  = newTemp(Ity_V128);
   9446       IRTemp rmode = newTemp(Ity_I32);
   9447       vassert(sz == 4);
   9448 
   9449       modrm = getIByte(delta+3);
   9450       if (epartIsReg(modrm)) {
   9451          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9452          delta += 3+1;
   9453          DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9454                                  nameXMMReg(gregOfRM(modrm)));
   9455       } else {
   9456          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9457 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9458          delta += 3+alen;
   9459          DIP("cvtpd2dq %s,%s\n", dis_buf,
   9460                                  nameXMMReg(gregOfRM(modrm)) );
   9461       }
   9462 
   9463       assign( rmode, get_sse_roundingmode() );
   9464       t0 = newTemp(Ity_F64);
   9465       t1 = newTemp(Ity_F64);
   9466       assign( t0, unop(Iop_ReinterpI64asF64,
   9467                        unop(Iop_V128to64, mkexpr(argV))) );
   9468       assign( t1, unop(Iop_ReinterpI64asF64,
   9469                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9470 
   9471 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9472                              mkexpr(rmode),                   \
   9473                              mkexpr(_t) )
   9474 
   9475       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9476       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9477       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9478       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9479 
   9480 #     undef CVT
   9481 
   9482       goto decode_success;
   9483    }
   9484 
   9485    /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9486       I32 in mmx, according to prevailing SSE rounding mode */
   9487    /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   9488       I32 in mmx, rounding towards zero */
   9489    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
   9490       IRTemp dst64  = newTemp(Ity_I64);
   9491       IRTemp rmode  = newTemp(Ity_I32);
   9492       IRTemp f64lo  = newTemp(Ity_F64);
   9493       IRTemp f64hi  = newTemp(Ity_F64);
   9494       Bool   r2zero = toBool(insn[1] == 0x2C);
   9495       /* Result is written to an MMX register (putMMXReg below). */
   9496       do_MMX_preamble();
   9497       modrm = getIByte(delta+2);
   9498 
   9499       if (epartIsReg(modrm)) {
   9500          delta += 2+1;
   9501          assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9502          assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
   9503          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9504                                    nameXMMReg(eregOfRM(modrm)),
   9505                                    nameMMXReg(gregOfRM(modrm)));
   9506       } else {
   9507          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9508          assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9509          assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
   9510                                               mkexpr(addr),
   9511                                               mkU32(8) ))); /* 2nd F64 at addr+8 */
   9512          delta += 2+alen;
   9513          DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   9514                                    dis_buf,
   9515                                    nameMMXReg(gregOfRM(modrm)));
   9516       }
   9517       /* 0x2C (CVTTPD2PI) forces Irrm_ZERO; 0x2D uses the SSE rounding mode. */
   9518       if (r2zero) {
   9519          assign(rmode, mkU32((UInt)Irrm_ZERO) );
   9520       } else {
   9521          assign( rmode, get_sse_roundingmode() );
   9522       }
   9523       /* Convert both F64s to I32 and combine them (f64hi's first) into one I64. */
   9524       assign(
   9525          dst64,
   9526          binop( Iop_32HLto64,
   9527                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   9528                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   9529               )
   9530       );
   9531 
   9532       putMMXReg(gregOfRM(modrm), mkexpr(dst64));
   9533       goto decode_success;
   9534    }
   9535 
   9536    /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   9537       lo half xmm(G), and zero upper half */
   9538    /* Note, this is practically identical to CVTPD2DQ.  It would have
   9539       been nicer to merge them together, but the insn[] offsets differ
   9540       by one. */
   9541    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9542       IRTemp argV  = newTemp(Ity_V128);
   9543       IRTemp rmode = newTemp(Ity_I32);
   9544 
   9545       modrm = getIByte(delta+2);
   9546       if (epartIsReg(modrm)) {
   9547          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9548          delta += 2+1;
   9549          DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9550                                  nameXMMReg(gregOfRM(modrm)));
   9551       } else {
   9552          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9553 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9554          delta += 2+alen;
   9555          DIP("cvtpd2ps %s,%s\n", dis_buf,
   9556                                  nameXMMReg(gregOfRM(modrm)) );
   9557       }
   9558 
   9559       assign( rmode, get_sse_roundingmode() );
   9560       t0 = newTemp(Ity_F64);
   9561       t1 = newTemp(Ity_F64);
   9562       assign( t0, unop(Iop_ReinterpI64asF64,
   9563                        unop(Iop_V128to64, mkexpr(argV))) );
   9564       assign( t1, unop(Iop_ReinterpI64asF64,
   9565                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9566 
   9567 #     define CVT(_t)  binop( Iop_F64toF32,                    \
   9568                              mkexpr(rmode),                   \
   9569                              mkexpr(_t) )
   9570 
   9571       putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
   9572       putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
   9573       putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
   9574       putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
   9575 
   9576 #     undef CVT
   9577 
   9578       goto decode_success;
   9579    }
   9580 
   9581    /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
   9582       xmm(G) */
   9583    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
   9584       IRTemp arg64 = newTemp(Ity_I64);
   9585 
   9586       modrm = getIByte(delta+2);
   9587       if (epartIsReg(modrm)) {
   9588          /* Only switch to MMX mode if the source is a MMX register.
   9589             This is inconsistent with all other instructions which
   9590             convert between XMM and (M64 or MMX), which always switch
   9591             to MMX mode even if 64-bit operand is M64 and not MMX.  At
   9592             least, that's what the Intel docs seem to me to say.
   9593             Fixes #210264. */
   9594          do_MMX_preamble();
   9595          assign( arg64, getMMXReg(eregOfRM(modrm)) );
   9596          delta += 2+1;
   9597          DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   9598                                  nameXMMReg(gregOfRM(modrm)));
   9599       } else {
   9600          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9601 	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   9602          delta += 2+alen;
   9603          DIP("cvtpi2pd %s,%s\n", dis_buf,
   9604                                  nameXMMReg(gregOfRM(modrm)) );
   9605       }
   9606 
   9607       putXMMRegLane64F(
   9608          gregOfRM(modrm), 0,
   9609          unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
   9610       );
   9611 
   9612       putXMMRegLane64F(
   9613          gregOfRM(modrm), 1,
   9614          unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
   9615       );
   9616 
   9617       goto decode_success;
   9618    }
   9619 
   9620    /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9621       xmm(G) */
   9622    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
   9623       IRTemp argV  = newTemp(Ity_V128);
   9624       IRTemp rmode = newTemp(Ity_I32);
   9625 
   9626       modrm = getIByte(delta+2);
   9627       if (epartIsReg(modrm)) {
   9628          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9629          delta += 2+1;
   9630          DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9631                                  nameXMMReg(gregOfRM(modrm)));
   9632       } else {
   9633          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9634 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9635          delta += 2+alen;
   9636          DIP("cvtps2dq %s,%s\n", dis_buf,
   9637                                  nameXMMReg(gregOfRM(modrm)) );
   9638       }
   9639 
   9640       assign( rmode, get_sse_roundingmode() );
   9641       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9642 
   9643       /* This is less than ideal.  If it turns out to be a performance
   9644 	 bottleneck it can be improved. */
   9645 #     define CVT(_t)                            \
   9646         binop( Iop_F64toI32S,                   \
   9647                mkexpr(rmode),                   \
   9648                unop( Iop_F32toF64,              \
   9649                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9650 
   9651       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9652       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9653       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9654       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9655 
   9656 #     undef CVT
   9657 
   9658       goto decode_success;
   9659    }
   9660 
   9661    /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   9662       F64 in xmm(G). */
   9663    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
   9664       IRTemp f32lo = newTemp(Ity_F32);
   9665       IRTemp f32hi = newTemp(Ity_F32);
   9666 
   9667       modrm = getIByte(delta+2);
   9668       if (epartIsReg(modrm)) {
   9669          assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
   9670          assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
   9671          delta += 2+1;
   9672          DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9673                                  nameXMMReg(gregOfRM(modrm)));
   9674       } else {
   9675          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9676 	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   9677 	 assign( f32hi, loadLE(Ity_F32,
   9678                                binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
   9679          delta += 2+alen;
   9680          DIP("cvtps2pd %s,%s\n", dis_buf,
   9681                                  nameXMMReg(gregOfRM(modrm)) );
   9682       }
   9683 
   9684       putXMMRegLane64F( gregOfRM(modrm), 1,
   9685                         unop(Iop_F32toF64, mkexpr(f32hi)) );
   9686       putXMMRegLane64F( gregOfRM(modrm), 0,
   9687                         unop(Iop_F32toF64, mkexpr(f32lo)) );
   9688 
   9689       goto decode_success;
   9690    }
   9691 
   9692    /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
   9693       I32 in ireg, according to prevailing SSE rounding mode */
   9694    /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
   9695       I32 in ireg, rounding towards zero */
   9696    if (insn[0] == 0xF2 && insn[1] == 0x0F
   9697        && (insn[2] == 0x2D || insn[2] == 0x2C)) {
   9698       IRTemp rmode = newTemp(Ity_I32);
   9699       IRTemp f64lo = newTemp(Ity_F64);
   9700       Bool   r2zero = toBool(insn[2] == 0x2C);
   9701       vassert(sz == 4);
   9702 
   9703       modrm = getIByte(delta+3);
   9704       if (epartIsReg(modrm)) {
   9705          delta += 3+1;
   9706 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9707          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9708                                    nameXMMReg(eregOfRM(modrm)),
   9709                                    nameIReg(4, gregOfRM(modrm)));
   9710       } else {
   9711          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9712 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9713          delta += 3+alen;
   9714          DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
   9715                                    dis_buf,
   9716                                    nameIReg(4, gregOfRM(modrm)));
   9717       }
   9718 
   9719       if (r2zero) {
   9720          assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9721       } else {
   9722          assign( rmode, get_sse_roundingmode() );
   9723       }
   9724 
   9725       putIReg(4, gregOfRM(modrm),
   9726                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   9727 
   9728       goto decode_success;
   9729    }
   9730 
   9731    /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   9732       low 1/4 xmm(G), according to prevailing SSE rounding mode */
   9733    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9734       IRTemp rmode = newTemp(Ity_I32);
   9735       IRTemp f64lo = newTemp(Ity_F64);
   9736       vassert(sz == 4);
   9737 
   9738       modrm = getIByte(delta+3);
   9739       if (epartIsReg(modrm)) {
   9740          delta += 3+1;
   9741 	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
   9742          DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9743                                  nameXMMReg(gregOfRM(modrm)));
   9744       } else {
   9745          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9746 	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   9747          delta += 3+alen;
   9748          DIP("cvtsd2ss %s,%s\n", dis_buf,
   9749                                  nameXMMReg(gregOfRM(modrm)));
   9750       }
   9751 
   9752       assign( rmode, get_sse_roundingmode() );
   9753       putXMMRegLane32F(
   9754          gregOfRM(modrm), 0,
   9755          binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   9756       );
   9757 
   9758       goto decode_success;
   9759    }
   9760 
   9761    /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
   9762       half xmm */
   9763    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
   9764       IRTemp arg32 = newTemp(Ity_I32);
   9765       vassert(sz == 4);
   9766 
   9767       modrm = getIByte(delta+3);
   9768       if (epartIsReg(modrm)) {
   9769          assign( arg32, getIReg(4, eregOfRM(modrm)) );
   9770          delta += 3+1;
   9771          DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
   9772                                  nameXMMReg(gregOfRM(modrm)));
   9773       } else {
   9774          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9775 	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   9776          delta += 3+alen;
   9777          DIP("cvtsi2sd %s,%s\n", dis_buf,
   9778                                  nameXMMReg(gregOfRM(modrm)) );
   9779       }
   9780 
   9781       putXMMRegLane64F(
   9782          gregOfRM(modrm), 0,
   9783          unop(Iop_I32StoF64, mkexpr(arg32)) );
   9784 
   9785       goto decode_success;
   9786    }
   9787 
   9788    /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   9789       low half xmm(G) */
   9790    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
   9791       IRTemp f32lo = newTemp(Ity_F32);
   9792       vassert(sz == 4);
   9793 
   9794       modrm = getIByte(delta+3);
   9795       if (epartIsReg(modrm)) {
   9796          delta += 3+1;
   9797 	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
   9798          DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9799                                  nameXMMReg(gregOfRM(modrm)));
   9800       } else {
   9801          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9802 	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   9803          delta += 3+alen;
   9804          DIP("cvtss2sd %s,%s\n", dis_buf,
   9805                                  nameXMMReg(gregOfRM(modrm)));
   9806       }
   9807 
   9808       putXMMRegLane64F( gregOfRM(modrm), 0,
   9809                         unop( Iop_F32toF64, mkexpr(f32lo) ) );
   9810 
   9811       goto decode_success;
   9812    }
   9813 
   9814    /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   9815       lo half xmm(G), and zero upper half, rounding towards zero */
   9816    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
   9817       IRTemp argV  = newTemp(Ity_V128);
   9818       IRTemp rmode = newTemp(Ity_I32);
   9819 
   9820       modrm = getIByte(delta+2);
   9821       if (epartIsReg(modrm)) {
   9822          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9823          delta += 2+1;
   9824          DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9825                                   nameXMMReg(gregOfRM(modrm)));
   9826       } else {
   9827          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9828 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9829          delta += 2+alen;
   9830          DIP("cvttpd2dq %s,%s\n", dis_buf,
   9831                                   nameXMMReg(gregOfRM(modrm)) );
   9832       }
   9833 
   9834       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9835 
   9836       t0 = newTemp(Ity_F64);
   9837       t1 = newTemp(Ity_F64);
   9838       assign( t0, unop(Iop_ReinterpI64asF64,
   9839                        unop(Iop_V128to64, mkexpr(argV))) );
   9840       assign( t1, unop(Iop_ReinterpI64asF64,
   9841                        unop(Iop_V128HIto64, mkexpr(argV))) );
   9842 
   9843 #     define CVT(_t)  binop( Iop_F64toI32S,                   \
   9844                              mkexpr(rmode),                   \
   9845                              mkexpr(_t) )
   9846 
   9847       putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
   9848       putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
   9849       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9850       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9851 
   9852 #     undef CVT
   9853 
   9854       goto decode_success;
   9855    }
   9856 
   9857    /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   9858       xmm(G), rounding towards zero */
   9859    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
   9860       IRTemp argV  = newTemp(Ity_V128);
   9861       IRTemp rmode = newTemp(Ity_I32);
   9862       vassert(sz == 4);
   9863 
   9864       modrm = getIByte(delta+3);
   9865       if (epartIsReg(modrm)) {
   9866          assign( argV, getXMMReg(eregOfRM(modrm)) );
   9867          delta += 3+1;
   9868          DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   9869                                   nameXMMReg(gregOfRM(modrm)));
   9870       } else {
   9871          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   9872 	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   9873          delta += 3+alen;
   9874          DIP("cvttps2dq %s,%s\n", dis_buf,
   9875                                   nameXMMReg(gregOfRM(modrm)) );
   9876       }
   9877 
   9878       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   9879       breakup128to32s( argV, &t3, &t2, &t1, &t0 );
   9880 
   9881       /* This is less than ideal.  If it turns out to be a performance
   9882 	 bottleneck it can be improved. */
   9883 #     define CVT(_t)                            \
   9884         binop( Iop_F64toI32S,                   \
   9885                mkexpr(rmode),                   \
   9886                unop( Iop_F32toF64,              \
   9887                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   9888 
   9889       putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
   9890       putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
   9891       putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
   9892       putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
   9893 
   9894 #     undef CVT
   9895 
   9896       goto decode_success;
   9897    }
   9898 
   9899    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   9900    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
   9901       delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
   9902       goto decode_success;
   9903    }
   9904 
   9905    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   9906    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
   9907       vassert(sz == 4);
   9908       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
   9909       goto decode_success;
   9910    }
   9911 
   9912    /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   9913    /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   9914    if (insn[0] == 0x0F && insn[1] == 0xAE
   9915        && epartIsReg(insn[2])
   9916        && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
   9917       vassert(sz == 4);
   9918       delta += 3;
   9919       /* Insert a memory fence.  It's sometimes important that these
   9920          are carried through to the generated code. */
   9921       stmt( IRStmt_MBE(Imbe_Fence) );
   9922       DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
   9923       goto decode_success;
   9924    }
   9925 
   9926    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   9927    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
   9928       delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
   9929       goto decode_success;
   9930    }
   9931 
   9932    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   9933    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
   9934       vassert(sz == 4);
   9935       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
   9936       goto decode_success;
   9937    }
   9938 
   9939    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   9940    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
   9941       delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
   9942       goto decode_success;
   9943    }
   9944 
   9945    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   9946    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
   9947       vassert(sz == 4);
   9948       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
   9949       goto decode_success;
   9950    }
   9951 
   9952    /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   9953    /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   9954    /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   9955    if (sz == 2 && insn[0] == 0x0F
   9956        && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
   9957       HChar* wot = insn[1]==0x28 ? "apd" :
   9958                    insn[1]==0x10 ? "upd" : "dqa";
   9959       modrm = getIByte(delta+2);
   9960       if (epartIsReg(modrm)) {
   9961          putXMMReg( gregOfRM(modrm),
   9962                     getXMMReg( eregOfRM(modrm) ));
   9963          DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
   9964                                    nameXMMReg(gregOfRM(modrm)));
   9965          delta += 2+1;
   9966       } else {
   9967          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9968          if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
   9969             gen_SEGV_if_not_16_aligned( addr );
   9970          putXMMReg( gregOfRM(modrm),
   9971                     loadLE(Ity_V128, mkexpr(addr)) );
   9972          DIP("mov%s %s,%s\n", wot, dis_buf,
   9973                                    nameXMMReg(gregOfRM(modrm)));
   9974          delta += 2+alen;
   9975       }
   9976       goto decode_success;
   9977    }
   9978 
   9979    /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   9980    /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   9981    if (sz == 2 && insn[0] == 0x0F
   9982        && (insn[1] == 0x29 || insn[1] == 0x11)) {
   9983       HChar* wot = insn[1]==0x29 ? "apd" : "upd";
   9984       modrm = getIByte(delta+2);
   9985       if (epartIsReg(modrm)) {
   9986          /* fall through; awaiting test case */
   9987       } else {
   9988          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   9989          if (insn[1] == 0x29/*movapd*/)
   9990             gen_SEGV_if_not_16_aligned( addr );
   9991          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   9992          DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
   9993                                    dis_buf );
   9994          delta += 2+alen;
   9995          goto decode_success;
   9996       }
   9997    }
   9998 
   9999    /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
   10000    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
   10001       modrm = getIByte(delta+2);
   10002       if (epartIsReg(modrm)) {
   10003          delta += 2+1;
   10004          putXMMReg(
   10005             gregOfRM(modrm),
   10006             unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
   10007          );
   10008          DIP("movd %s, %s\n",
   10009              nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
   10010       } else {
   10011          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10012          delta += 2+alen;
   10013          putXMMReg(
   10014             gregOfRM(modrm),
   10015             unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   10016          );
   10017          DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
   10018       }
   10019       goto decode_success;
   10020    }
   10021 
   10022    /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
   10023    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
   10024       modrm = getIByte(delta+2);
   10025       if (epartIsReg(modrm)) {
   10026          delta += 2+1;
   10027          putIReg( 4, eregOfRM(modrm),
   10028                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10029          DIP("movd %s, %s\n",
   10030              nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
   10031       } else {
   10032          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10033          delta += 2+alen;
   10034          storeLE( mkexpr(addr),
   10035                   getXMMRegLane32(gregOfRM(modrm), 0) );
   10036          DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10037       }
   10038       goto decode_success;
   10039    }
   10040 
   10041    /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   10042    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
   10043       modrm = getIByte(delta+2);
   10044       if (epartIsReg(modrm)) {
   10045          delta += 2+1;
   10046          putXMMReg( eregOfRM(modrm),
   10047                     getXMMReg(gregOfRM(modrm)) );
   10048          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10049                                 nameXMMReg(eregOfRM(modrm)));
   10050       } else {
   10051          addr = disAMode( &alen, sorb, delta+2, dis_buf );
   10052          delta += 2+alen;
   10053          gen_SEGV_if_not_16_aligned( addr );
   10054          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10055          DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10056       }
   10057       goto decode_success;
   10058    }
   10059 
   10060    /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   10061    /* Unfortunately can't simply use the MOVDQA case since the
   10062       prefix lengths are different (66 vs F3) */
   10063    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
   10064       vassert(sz == 4);
   10065       modrm = getIByte(delta+3);
   10066       if (epartIsReg(modrm)) {
   10067          putXMMReg( gregOfRM(modrm),
   10068                     getXMMReg( eregOfRM(modrm) ));
   10069          DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10070                                nameXMMReg(gregOfRM(modrm)));
   10071          delta += 3+1;
   10072       } else {
   10073          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10074          putXMMReg( gregOfRM(modrm),
   10075                     loadLE(Ity_V128, mkexpr(addr)) );
   10076          DIP("movdqu %s,%s\n", dis_buf,
   10077                                nameXMMReg(gregOfRM(modrm)));
   10078          delta += 3+alen;
   10079       }
   10080       goto decode_success;
   10081    }
   10082 
   10083    /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   10084    /* Unfortunately can't simply use the MOVDQA case since the
   10085       prefix lengths are different (66 vs F3) */
   10086    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
   10087       vassert(sz == 4);
   10088       modrm = getIByte(delta+3);
   10089       if (epartIsReg(modrm)) {
   10090          delta += 3+1;
   10091          putXMMReg( eregOfRM(modrm),
   10092                     getXMMReg(gregOfRM(modrm)) );
   10093          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
   10094                                 nameXMMReg(eregOfRM(modrm)));
   10095       } else {
   10096          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   10097          delta += 3+alen;
   10098          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10099          DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
   10100       }
   10101       goto decode_success;
   10102    }
   10103 
   10104    /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   10105    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10106       vassert(sz == 4);
   10107       modrm = getIByte(delta+3);
   10108       if (epartIsReg(modrm)) {
   10109          do_MMX_preamble();
   10110          putMMXReg( gregOfRM(modrm),
   10111                     getXMMRegLane64( eregOfRM(modrm), 0 ));
   10112          DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10113                                 nameMMXReg(gregOfRM(modrm)));
   10114          delta += 3+1;
   10115          goto decode_success;
   10116       } else {
   10117          /* fall through, apparently no mem case for this insn */
   10118       }
   10119    }
   10120 
   10121    /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   10122    /* This seems identical to MOVHPS.  This instruction encoding is
   10123       completely crazy. */
   10124    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
   10125       modrm = getIByte(delta+2);
   10126       if (epartIsReg(modrm)) {
   10127          /* fall through; apparently reg-reg is not possible */
   10128       } else {
   10129          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10130          delta += 2+alen;
   10131          putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
   10132                           loadLE(Ity_I64, mkexpr(addr)) );
   10133          DIP("movhpd %s,%s\n", dis_buf,
   10134                                nameXMMReg( gregOfRM(modrm) ));
   10135          goto decode_success;
   10136       }
   10137    }
   10138 
   10139    /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   10140    /* Again, this seems identical to MOVHPS. */
   10141    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
   10142       if (!epartIsReg(insn[2])) {
   10143          delta += 2;
   10144          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10145          delta += alen;
   10146          storeLE( mkexpr(addr),
   10147                   getXMMRegLane64( gregOfRM(insn[2]),
   10148                                    1/*upper lane*/ ) );
   10149          DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10150                                dis_buf);
   10151          goto decode_success;
   10152       }
   10153       /* else fall through */
   10154    }
   10155 
   10156    /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   10157    /* Identical to MOVLPS ? */
   10158    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
   10159       modrm = getIByte(delta+2);
   10160       if (epartIsReg(modrm)) {
   10161          /* fall through; apparently reg-reg is not possible */
   10162       } else {
   10163          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10164          delta += 2+alen;
   10165          putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
   10166                           loadLE(Ity_I64, mkexpr(addr)) );
   10167          DIP("movlpd %s, %s\n",
   10168              dis_buf, nameXMMReg( gregOfRM(modrm) ));
   10169          goto decode_success;
   10170       }
   10171    }
   10172 
   10173    /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   10174    /* Identical to MOVLPS ? */
   10175    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
   10176       if (!epartIsReg(insn[2])) {
   10177          delta += 2;
   10178          addr = disAMode ( &alen, sorb, delta, dis_buf );
   10179          delta += alen;
   10180          storeLE( mkexpr(addr),
   10181                   getXMMRegLane64( gregOfRM(insn[2]),
   10182                                    0/*lower lane*/ ) );
   10183          DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
   10184                                 dis_buf);
   10185          goto decode_success;
   10186       }
   10187       /* else fall through */
   10188    }
   10189 
   10190    /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
   10191       2 lowest bits of ireg(G) */
   10192    if (insn[0] == 0x0F && insn[1] == 0x50) {
   10193       modrm = getIByte(delta+2);
   10194       if (sz == 2 && epartIsReg(modrm)) {
   10195          Int src;
   10196          t0 = newTemp(Ity_I32);
   10197          t1 = newTemp(Ity_I32);
   10198          delta += 2+1;
   10199          src = eregOfRM(modrm);
   10200          assign( t0, binop( Iop_And32,
   10201                             binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
   10202                             mkU32(1) ));
   10203          assign( t1, binop( Iop_And32,
   10204                             binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
   10205                             mkU32(2) ));
   10206          putIReg(4, gregOfRM(modrm),
   10207                     binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
   10208                  );
   10209          DIP("movmskpd %s,%s\n", nameXMMReg(src),
   10210                                  nameIReg(4, gregOfRM(modrm)));
   10211          goto decode_success;
   10212       }
   10213       /* else fall through */
   10214    }
   10215 
   10216    /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   10217    if (insn[0] == 0x0F && insn[1] == 0xF7) {
   10218       modrm = getIByte(delta+2);
   10219       if (sz == 2 && epartIsReg(modrm)) {
   10220          IRTemp regD    = newTemp(Ity_V128);
   10221          IRTemp mask    = newTemp(Ity_V128);
   10222          IRTemp olddata = newTemp(Ity_V128);
   10223          IRTemp newdata = newTemp(Ity_V128);
   10224                 addr    = newTemp(Ity_I32);
   10225 
   10226          assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
   10227          assign( regD, getXMMReg( gregOfRM(modrm) ));
   10228 
   10229          /* Unfortunately can't do the obvious thing with SarN8x16
   10230             here since that can't be re-emitted as SSE2 code - no such
   10231             insn. */
   10232 	 assign(
   10233             mask,
   10234             binop(Iop_64HLtoV128,
   10235                   binop(Iop_SarN8x8,
   10236                         getXMMRegLane64( eregOfRM(modrm), 1 ),
   10237                         mkU8(7) ),
   10238                   binop(Iop_SarN8x8,
   10239                         getXMMRegLane64( eregOfRM(modrm), 0 ),
   10240                         mkU8(7) ) ));
   10241          assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10242          assign( newdata,
   10243                  binop(Iop_OrV128,
   10244                        binop(Iop_AndV128,
   10245                              mkexpr(regD),
   10246                              mkexpr(mask) ),
   10247                        binop(Iop_AndV128,
   10248                              mkexpr(olddata),
   10249                              unop(Iop_NotV128, mkexpr(mask)))) );
   10250          storeLE( mkexpr(addr), mkexpr(newdata) );
   10251 
   10252          delta += 2+1;
   10253          DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
   10254                                    nameXMMReg( gregOfRM(modrm) ) );
   10255          goto decode_success;
   10256       }
   10257       /* else fall through */
   10258    }
   10259 
   10260    /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   10261    if (insn[0] == 0x0F && insn[1] == 0xE7) {
   10262       modrm = getIByte(delta+2);
   10263       if (sz == 2 && !epartIsReg(modrm)) {
   10264          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10265          gen_SEGV_if_not_16_aligned( addr );
   10266          storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
   10267          DIP("movntdq %s,%s\n", dis_buf,
   10268                                 nameXMMReg(gregOfRM(modrm)));
   10269          delta += 2+alen;
   10270          goto decode_success;
   10271       }
   10272       /* else fall through */
   10273    }
   10274 
   10275    /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   10276    if (insn[0] == 0x0F && insn[1] == 0xC3) {
   10277       vassert(sz == 4);
   10278       modrm = getIByte(delta+2);
   10279       if (!epartIsReg(modrm)) {
   10280          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10281          storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
   10282          DIP("movnti %s,%s\n", dis_buf,
   10283                                nameIReg(4, gregOfRM(modrm)));
   10284          delta += 2+alen;
   10285          goto decode_success;
   10286       }
   10287       /* else fall through */
   10288    }
   10289 
   10290    /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   10291       or lo half xmm).  */
   10292    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
   10293       modrm = getIByte(delta+2);
   10294       if (epartIsReg(modrm)) {
   10295          /* fall through, awaiting test case */
   10296          /* dst: lo half copied, hi half zeroed */
   10297       } else {
   10298          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10299          storeLE( mkexpr(addr),
   10300                   getXMMRegLane64( gregOfRM(modrm), 0 ));
   10301          DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
   10302          delta += 2+alen;
   10303          goto decode_success;
   10304       }
   10305    }
   10306 
   10307    /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   10308       hi half). */
   10309    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
   10310       vassert(sz == 4);
   10311       modrm = getIByte(delta+3);
   10312       if (epartIsReg(modrm)) {
   10313          do_MMX_preamble();
   10314          putXMMReg( gregOfRM(modrm),
   10315                     unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
   10316          DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10317                                 nameXMMReg(gregOfRM(modrm)));
   10318          delta += 3+1;
   10319          goto decode_success;
   10320       } else {
   10321          /* fall through, apparently no mem case for this insn */
   10322       }
   10323    }
   10324 
   10325    /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   10326       G (lo half xmm).  Upper half of G is zeroed out. */
   10327    /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   10328       G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   10329       If E is reg, upper half of G is unchanged. */
   10330    if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
   10331        || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
   10332       vassert(sz == 4);
   10333       modrm = getIByte(delta+3);
   10334       if (epartIsReg(modrm)) {
   10335          putXMMRegLane64( gregOfRM(modrm), 0,
   10336                           getXMMRegLane64( eregOfRM(modrm), 0 ));
   10337          if (insn[0] == 0xF3/*MOVQ*/) {
   10338             /* zero bits 127:64 */
   10339             putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10340          }
   10341          DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10342                               nameXMMReg(gregOfRM(modrm)));
   10343          delta += 3+1;
   10344       } else {
   10345          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10346          /* zero bits 127:64 */
   10347          putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
   10348          /* write bits 63:0 */
   10349          putXMMRegLane64( gregOfRM(modrm), 0,
   10350                           loadLE(Ity_I64, mkexpr(addr)) );
   10351          DIP("movsd %s,%s\n", dis_buf,
   10352                               nameXMMReg(gregOfRM(modrm)));
   10353          delta += 3+alen;
   10354       }
   10355       goto decode_success;
   10356    }
   10357 
   10358    /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   10359       or lo half xmm). */
   10360    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
   10361       vassert(sz == 4);
   10362       modrm = getIByte(delta+3);
   10363       if (epartIsReg(modrm)) {
   10364          putXMMRegLane64( eregOfRM(modrm), 0,
   10365                           getXMMRegLane64( gregOfRM(modrm), 0 ));
   10366          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10367                               nameXMMReg(eregOfRM(modrm)));
   10368          delta += 3+1;
   10369       } else {
   10370          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   10371          storeLE( mkexpr(addr),
   10372                   getXMMRegLane64(gregOfRM(modrm), 0) );
   10373          DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
   10374                               dis_buf);
   10375          delta += 3+alen;
   10376       }
   10377       goto decode_success;
   10378    }
   10379 
   10380    /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   10381    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
   10382       delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
   10383       goto decode_success;
   10384    }
   10385 
   10386    /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   10387    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
   10388       vassert(sz == 4);
   10389       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
   10390       goto decode_success;
   10391    }
   10392 
   10393    /* 66 0F 56 = ORPD -- G = G or E */
   10394    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
   10395       delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
   10396       goto decode_success;
   10397    }
   10398 
   10399    /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   10400    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
   10401       Int    select;
   10402       IRTemp sV = newTemp(Ity_V128);
   10403       IRTemp dV = newTemp(Ity_V128);
   10404       IRTemp s1 = newTemp(Ity_I64);
   10405       IRTemp s0 = newTemp(Ity_I64);
   10406       IRTemp d1 = newTemp(Ity_I64);
   10407       IRTemp d0 = newTemp(Ity_I64);
   10408 
   10409       modrm = insn[2];
   10410       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10411 
   10412       if (epartIsReg(modrm)) {
   10413          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10414          select = (Int)insn[3];
   10415          delta += 2+2;
   10416          DIP("shufpd $%d,%s,%s\n", select,
   10417                                    nameXMMReg(eregOfRM(modrm)),
   10418                                    nameXMMReg(gregOfRM(modrm)));
   10419       } else {
   10420          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10421          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10422          select = (Int)insn[2+alen];
   10423          delta += 3+alen;
   10424          DIP("shufpd $%d,%s,%s\n", select,
   10425                                    dis_buf,
   10426                                    nameXMMReg(gregOfRM(modrm)));
   10427       }
   10428 
   10429       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10430       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10431       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10432       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10433 
   10434 #     define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10435 #     define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10436 
   10437       putXMMReg(
   10438          gregOfRM(modrm),
   10439          binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
   10440       );
   10441 
   10442 #     undef SELD
   10443 #     undef SELS
   10444 
   10445       goto decode_success;
   10446    }
   10447 
   10448    /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
   10449    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
   10450       delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
   10451                                         "sqrtpd", Iop_Sqrt64Fx2 );
   10452       goto decode_success;
   10453    }
   10454 
   10455    /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
   10456    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
   10457       vassert(sz == 4);
   10458       delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
   10459                                          "sqrtsd", Iop_Sqrt64F0x2 );
   10460       goto decode_success;
   10461    }
   10462 
   10463    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   10464    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
   10465       delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
   10466       goto decode_success;
   10467    }
   10468 
   10469    /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   10470    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
   10471       vassert(sz == 4);
   10472       delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
   10473       goto decode_success;
   10474    }
   10475 
   10476    /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   10477    /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   10478    /* These just appear to be special cases of SHUFPD */
   10479    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
   10480       IRTemp s1 = newTemp(Ity_I64);
   10481       IRTemp s0 = newTemp(Ity_I64);
   10482       IRTemp d1 = newTemp(Ity_I64);
   10483       IRTemp d0 = newTemp(Ity_I64);
   10484       IRTemp sV = newTemp(Ity_V128);
   10485       IRTemp dV = newTemp(Ity_V128);
   10486       Bool   hi = toBool(insn[1] == 0x15);
   10487 
   10488       modrm = insn[2];
   10489       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10490 
   10491       if (epartIsReg(modrm)) {
   10492          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10493          delta += 2+1;
   10494          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10495                                   nameXMMReg(eregOfRM(modrm)),
   10496                                   nameXMMReg(gregOfRM(modrm)));
   10497       } else {
   10498          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10499          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10500          delta += 2+alen;
   10501          DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   10502                                   dis_buf,
   10503                                   nameXMMReg(gregOfRM(modrm)));
   10504       }
   10505 
   10506       assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10507       assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10508       assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10509       assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10510 
   10511       if (hi) {
   10512          putXMMReg( gregOfRM(modrm),
   10513                     binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   10514       } else {
   10515          putXMMReg( gregOfRM(modrm),
   10516                     binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   10517       }
   10518 
   10519       goto decode_success;
   10520    }
   10521 
   10522    /* 66 0F 57 = XORPD -- G = G xor E */
   10523    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
   10524       delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
   10525       goto decode_success;
   10526    }
   10527 
   10528    /* 66 0F 6B = PACKSSDW */
   10529    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
   10530       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10531                                  "packssdw", Iop_QNarrow32Sx4, True );
   10532       goto decode_success;
   10533    }
   10534 
   10535    /* 66 0F 63 = PACKSSWB */
   10536    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
   10537       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10538                                  "packsswb", Iop_QNarrow16Sx8, True );
   10539       goto decode_success;
   10540    }
   10541 
   10542    /* 66 0F 67 = PACKUSWB */
   10543    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
   10544       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10545                                  "packuswb", Iop_QNarrow16Ux8, True );
   10546       goto decode_success;
   10547    }
   10548 
   10549    /* 66 0F FC = PADDB */
   10550    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
   10551       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10552                                  "paddb", Iop_Add8x16, False );
   10553       goto decode_success;
   10554    }
   10555 
   10556    /* 66 0F FE = PADDD */
   10557    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
   10558       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10559                                  "paddd", Iop_Add32x4, False );
   10560       goto decode_success;
   10561    }
   10562 
   10563    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10564    /* 0F D4 = PADDQ -- add 64x1 */
   10565    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10566       do_MMX_preamble();
   10567       delta = dis_MMXop_regmem_to_reg (
   10568                 sorb, delta+2, insn[1], "paddq", False );
   10569       goto decode_success;
   10570    }
   10571 
   10572    /* 66 0F D4 = PADDQ */
   10573    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
   10574       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10575                                  "paddq", Iop_Add64x2, False );
   10576       goto decode_success;
   10577    }
   10578 
   10579    /* 66 0F FD = PADDW */
   10580    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
   10581       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10582                                  "paddw", Iop_Add16x8, False );
   10583       goto decode_success;
   10584    }
   10585 
   10586    /* 66 0F EC = PADDSB */
   10587    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
   10588       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10589                                  "paddsb", Iop_QAdd8Sx16, False );
   10590       goto decode_success;
   10591    }
   10592 
   10593    /* 66 0F ED = PADDSW */
   10594    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
   10595       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10596                                  "paddsw", Iop_QAdd16Sx8, False );
   10597       goto decode_success;
   10598    }
   10599 
   10600    /* 66 0F DC = PADDUSB */
   10601    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
   10602       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10603                                  "paddusb", Iop_QAdd8Ux16, False );
   10604       goto decode_success;
   10605    }
   10606 
   10607    /* 66 0F DD = PADDUSW */
   10608    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
   10609       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10610                                  "paddusw", Iop_QAdd16Ux8, False );
   10611       goto decode_success;
   10612    }
   10613 
   10614    /* 66 0F DB = PAND */
   10615    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
   10616       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
   10617       goto decode_success;
   10618    }
   10619 
   10620    /* 66 0F DF = PANDN */
   10621    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
   10622       delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
   10623       goto decode_success;
   10624    }
   10625 
   10626    /* 66 0F E0 = PAVGB */
   10627    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
   10628       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10629                                  "pavgb", Iop_Avg8Ux16, False );
   10630       goto decode_success;
   10631    }
   10632 
   10633    /* 66 0F E3 = PAVGW */
   10634    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
   10635       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10636                                  "pavgw", Iop_Avg16Ux8, False );
   10637       goto decode_success;
   10638    }
   10639 
   10640    /* 66 0F 74 = PCMPEQB */
   10641    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
   10642       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10643                                  "pcmpeqb", Iop_CmpEQ8x16, False );
   10644       goto decode_success;
   10645    }
   10646 
   10647    /* 66 0F 76 = PCMPEQD */
   10648    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
   10649       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10650                                  "pcmpeqd", Iop_CmpEQ32x4, False );
   10651       goto decode_success;
   10652    }
   10653 
   10654    /* 66 0F 75 = PCMPEQW */
   10655    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
   10656       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10657                                  "pcmpeqw", Iop_CmpEQ16x8, False );
   10658       goto decode_success;
   10659    }
   10660 
   10661    /* 66 0F 64 = PCMPGTB */
   10662    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
   10663       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10664                                  "pcmpgtb", Iop_CmpGT8Sx16, False );
   10665       goto decode_success;
   10666    }
   10667 
   10668    /* 66 0F 66 = PCMPGTD */
   10669    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
   10670       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10671                                  "pcmpgtd", Iop_CmpGT32Sx4, False );
   10672       goto decode_success;
   10673    }
   10674 
   10675    /* 66 0F 65 = PCMPGTW */
   10676    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
   10677       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10678                                  "pcmpgtw", Iop_CmpGT16Sx8, False );
   10679       goto decode_success;
   10680    }
   10681 
   10682    /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   10683       zero-extend of it in ireg(G). */
   10684    if (insn[0] == 0x0F && insn[1] == 0xC5) {
   10685       modrm = insn[2];
   10686       if (sz == 2 && epartIsReg(modrm)) {
   10687          t5 = newTemp(Ity_V128);
   10688          t4 = newTemp(Ity_I16);
   10689          assign(t5, getXMMReg(eregOfRM(modrm)));
   10690          breakup128to32s( t5, &t3, &t2, &t1, &t0 );
   10691          switch (insn[3] & 7) {
   10692             case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
   10693             case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
   10694             case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
   10695             case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
   10696             case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
   10697             case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
   10698             case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
   10699             case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
   10700             default: vassert(0); /*NOTREACHED*/
   10701          }
   10702          putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
   10703          DIP("pextrw $%d,%s,%s\n",
   10704              (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
   10705                            nameIReg(4,gregOfRM(modrm)));
   10706          delta += 4;
   10707          goto decode_success;
   10708       }
   10709       /* else fall through */
   10710    }
   10711 
   10712    /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   10713       put it into the specified lane of xmm(G). */
   10714    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
   10715       Int lane;
   10716       t4 = newTemp(Ity_I16);
   10717       modrm = insn[2];
   10718 
   10719       if (epartIsReg(modrm)) {
   10720          assign(t4, getIReg(2, eregOfRM(modrm)));
   10721          delta += 3+1;
   10722          lane = insn[3+1-1];
   10723          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10724                                    nameIReg(2,eregOfRM(modrm)),
   10725                                    nameXMMReg(gregOfRM(modrm)));
   10726       } else {
   10727          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10728          delta += 3+alen;
   10729          lane = insn[3+alen-1];
   10730          assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   10731          DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   10732                                    dis_buf,
   10733                                    nameXMMReg(gregOfRM(modrm)));
   10734       }
   10735 
   10736       putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
   10737       goto decode_success;
   10738    }
   10739 
   10740    /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   10741       E(xmm or mem) to G(xmm) */
   10742    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
   10743       IRTemp s1V  = newTemp(Ity_V128);
   10744       IRTemp s2V  = newTemp(Ity_V128);
   10745       IRTemp dV   = newTemp(Ity_V128);
   10746       IRTemp s1Hi = newTemp(Ity_I64);
   10747       IRTemp s1Lo = newTemp(Ity_I64);
   10748       IRTemp s2Hi = newTemp(Ity_I64);
   10749       IRTemp s2Lo = newTemp(Ity_I64);
   10750       IRTemp dHi  = newTemp(Ity_I64);
   10751       IRTemp dLo  = newTemp(Ity_I64);
   10752       modrm = insn[2];
   10753       if (epartIsReg(modrm)) {
   10754          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   10755          delta += 2+1;
   10756          DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10757                                 nameXMMReg(gregOfRM(modrm)));
   10758       } else {
   10759          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10760          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   10761          delta += 2+alen;
   10762          DIP("pmaddwd %s,%s\n", dis_buf,
   10763                                 nameXMMReg(gregOfRM(modrm)));
   10764       }
   10765       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   10766       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   10767       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   10768       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   10769       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   10770       assign( dHi, mkIRExprCCall(
   10771                       Ity_I64, 0/*regparms*/,
   10772                       "x86g_calculate_mmx_pmaddwd",
   10773                       &x86g_calculate_mmx_pmaddwd,
   10774                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   10775                    ));
   10776       assign( dLo, mkIRExprCCall(
   10777                       Ity_I64, 0/*regparms*/,
   10778                       "x86g_calculate_mmx_pmaddwd",
   10779                       &x86g_calculate_mmx_pmaddwd,
   10780                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   10781                    ));
   10782       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   10783       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   10784       goto decode_success;
   10785    }
   10786 
   10787    /* 66 0F EE = PMAXSW -- 16x8 signed max */
   10788    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
   10789       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10790                                  "pmaxsw", Iop_Max16Sx8, False );
   10791       goto decode_success;
   10792    }
   10793 
   10794    /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   10795    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
   10796       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10797                                  "pmaxub", Iop_Max8Ux16, False );
   10798       goto decode_success;
   10799    }
   10800 
   10801    /* 66 0F EA = PMINSW -- 16x8 signed min */
   10802    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
   10803       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10804                                  "pminsw", Iop_Min16Sx8, False );
   10805       goto decode_success;
   10806    }
   10807 
   10808    /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   10809    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
   10810       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10811                                  "pminub", Iop_Min8Ux16, False );
   10812       goto decode_success;
   10813    }
   10814 
   10815    /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
   10816       xmm(E), pack them into a 16-bit value, and put zero-extend of it
   10817       in ireg(G).  Doing this directly is just too cumbersome; give up
   10818       therefore and call a helper. */
   10819    /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   10820    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
   10821       modrm = insn[2];
   10822       if (epartIsReg(modrm)) {
   10823          t0 = newTemp(Ity_I64);
   10824          t1 = newTemp(Ity_I64);
   10825          assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
   10826          assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
   10827          t5 = newTemp(Ity_I32);
   10828          assign(t5, mkIRExprCCall(
   10829                        Ity_I32, 0/*regparms*/,
   10830                        "x86g_calculate_sse_pmovmskb",
   10831                        &x86g_calculate_sse_pmovmskb,
   10832                        mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
   10833          putIReg(4, gregOfRM(modrm), mkexpr(t5));
   10834          DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10835                                  nameIReg(4,gregOfRM(modrm)));
   10836          delta += 3;
   10837          goto decode_success;
   10838       }
   10839       /* else fall through */
   10840    }
   10841 
   10842    /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   10843    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
   10844       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10845                                  "pmulhuw", Iop_MulHi16Ux8, False );
   10846       goto decode_success;
   10847    }
   10848 
   10849    /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   10850    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
   10851       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10852                                  "pmulhw", Iop_MulHi16Sx8, False );
   10853       goto decode_success;
   10854    }
   10855 
   10856    /* 66 0F D5 = PMULLW -- 16x8 multiply */
   10857    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
   10858       delta = dis_SSEint_E_to_G( sorb, delta+2,
   10859                                  "pmullw", Iop_Mul16x8, False );
   10860       goto decode_success;
   10861    }
   10862 
   10863    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   10864    /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10865       0 to form 64-bit result */
   10866    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10867       IRTemp sV = newTemp(Ity_I64);
   10868       IRTemp dV = newTemp(Ity_I64);
   10869       t1 = newTemp(Ity_I32);
   10870       t0 = newTemp(Ity_I32);
   10871       modrm = insn[2];
   10872 
   10873       do_MMX_preamble();
   10874       assign( dV, getMMXReg(gregOfRM(modrm)) );
   10875 
   10876       if (epartIsReg(modrm)) {
   10877          assign( sV, getMMXReg(eregOfRM(modrm)) );
   10878          delta += 2+1;
   10879          DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   10880                                 nameMMXReg(gregOfRM(modrm)));
   10881       } else {
   10882          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10883          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   10884          delta += 2+alen;
   10885          DIP("pmuludq %s,%s\n", dis_buf,
   10886                                 nameMMXReg(gregOfRM(modrm)));
   10887       }
   10888 
   10889       assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   10890       assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   10891       putMMXReg( gregOfRM(modrm),
   10892                  binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   10893       goto decode_success;
   10894    }
   10895 
   10896    /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   10897       0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   10898       half */
   10899    /* This is a really poor translation -- could be improved if
   10900       performance critical */
   10901    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
   10902       IRTemp sV, dV;
   10903       IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10904       sV = newTemp(Ity_V128);
   10905       dV = newTemp(Ity_V128);
   10906       s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10907       t1 = newTemp(Ity_I64);
   10908       t0 = newTemp(Ity_I64);
   10909       modrm = insn[2];
   10910       assign( dV, getXMMReg(gregOfRM(modrm)) );
   10911 
   10912       if (epartIsReg(modrm)) {
   10913          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10914          delta += 2+1;
   10915          DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10916                                 nameXMMReg(gregOfRM(modrm)));
   10917       } else {
   10918          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10919          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10920          delta += 2+alen;
   10921          DIP("pmuludq %s,%s\n", dis_buf,
   10922                                 nameXMMReg(gregOfRM(modrm)));
   10923       }
   10924 
   10925       breakup128to32s( dV, &d3, &d2, &d1, &d0 );
   10926       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   10927 
   10928       assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
   10929       putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
   10930       assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
   10931       putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
   10932       goto decode_success;
   10933    }
   10934 
   10935    /* 66 0F EB = POR */
   10936    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
   10937       delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
   10938       goto decode_success;
   10939    }
   10940 
   10941    /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   10942       from E(xmm or mem) to G(xmm) */
   10943    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
   10944       IRTemp s1V  = newTemp(Ity_V128);
   10945       IRTemp s2V  = newTemp(Ity_V128);
   10946       IRTemp dV   = newTemp(Ity_V128);
   10947       IRTemp s1Hi = newTemp(Ity_I64);
   10948       IRTemp s1Lo = newTemp(Ity_I64);
   10949       IRTemp s2Hi = newTemp(Ity_I64);
   10950       IRTemp s2Lo = newTemp(Ity_I64);
   10951       IRTemp dHi  = newTemp(Ity_I64);
   10952       IRTemp dLo  = newTemp(Ity_I64);
   10953       modrm = insn[2];
   10954       if (epartIsReg(modrm)) {
   10955          assign( s1V, getXMMReg(eregOfRM(modrm)) );
   10956          delta += 2+1;
   10957          DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   10958                                nameXMMReg(gregOfRM(modrm)));
   10959       } else {
   10960          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   10961          assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
   10962          delta += 2+alen;
   10963          DIP("psadbw %s,%s\n", dis_buf,
   10964                                nameXMMReg(gregOfRM(modrm)));
   10965       }
   10966       assign( s2V, getXMMReg(gregOfRM(modrm)) );
   10967       assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
   10968       assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
   10969       assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
   10970       assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
   10971       assign( dHi, mkIRExprCCall(
   10972                       Ity_I64, 0/*regparms*/,
   10973                       "x86g_calculate_mmx_psadbw",
   10974                       &x86g_calculate_mmx_psadbw,
   10975                       mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
   10976                    ));
   10977       assign( dLo, mkIRExprCCall(
   10978                       Ity_I64, 0/*regparms*/,
   10979                       "x86g_calculate_mmx_psadbw",
   10980                       &x86g_calculate_mmx_psadbw,
   10981                       mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
   10982                    ));
   10983       assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
   10984       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   10985       goto decode_success;
   10986    }
   10987 
   10988    /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   10989    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
   10990       Int order;
   10991       IRTemp sV, dV, s3, s2, s1, s0;
   10992       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10993       sV = newTemp(Ity_V128);
   10994       dV = newTemp(Ity_V128);
   10995       modrm = insn[2];
   10996       if (epartIsReg(modrm)) {
   10997          assign( sV, getXMMReg(eregOfRM(modrm)) );
   10998          order = (Int)insn[3];
   10999          delta += 2+2;
   11000          DIP("pshufd $%d,%s,%s\n", order,
   11001                                    nameXMMReg(eregOfRM(modrm)),
   11002                                    nameXMMReg(gregOfRM(modrm)));
   11003       } else {
   11004          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11005          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11006 	 order = (Int)insn[2+alen];
   11007          delta += 3+alen;
   11008          DIP("pshufd $%d,%s,%s\n", order,
   11009                                    dis_buf,
   11010                                    nameXMMReg(gregOfRM(modrm)));
   11011       }
   11012       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11013 
   11014 #     define SEL(n) \
   11015                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11016       assign(dV,
   11017 	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   11018                            SEL((order>>2)&3), SEL((order>>0)&3) )
   11019       );
   11020       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11021 #     undef SEL
   11022       goto decode_success;
   11023    }
   11024 
   11025    /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   11026       mem) to G(xmm), and copy lower half */
   11027    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
   11028       Int order;
   11029       IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
   11030       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11031       sV   = newTemp(Ity_V128);
   11032       dV   = newTemp(Ity_V128);
   11033       sVhi = newTemp(Ity_I64);
   11034       dVhi = newTemp(Ity_I64);
   11035       modrm = insn[3];
   11036       if (epartIsReg(modrm)) {
   11037          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11038          order = (Int)insn[4];
   11039          delta += 4+1;
   11040          DIP("pshufhw $%d,%s,%s\n", order,
   11041                                     nameXMMReg(eregOfRM(modrm)),
   11042                                     nameXMMReg(gregOfRM(modrm)));
   11043       } else {
   11044          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11045          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11046 	 order = (Int)insn[3+alen];
   11047          delta += 4+alen;
   11048          DIP("pshufhw $%d,%s,%s\n", order,
   11049                                     dis_buf,
   11050                                     nameXMMReg(gregOfRM(modrm)));
   11051       }
   11052       assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
   11053       breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
   11054 
   11055 #     define SEL(n) \
   11056                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11057       assign(dVhi,
   11058 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11059                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11060       );
   11061       assign(dV, binop( Iop_64HLtoV128,
   11062                         mkexpr(dVhi),
   11063                         unop(Iop_V128to64, mkexpr(sV))) );
   11064       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11065 #     undef SEL
   11066       goto decode_success;
   11067    }
   11068 
   11069    /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   11070       mem) to G(xmm), and copy upper half */
   11071    if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
   11072       Int order;
   11073       IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
   11074       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11075       sV   = newTemp(Ity_V128);
   11076       dV   = newTemp(Ity_V128);
   11077       sVlo = newTemp(Ity_I64);
   11078       dVlo = newTemp(Ity_I64);
   11079       modrm = insn[3];
   11080       if (epartIsReg(modrm)) {
   11081          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11082          order = (Int)insn[4];
   11083          delta += 4+1;
   11084          DIP("pshuflw $%d,%s,%s\n", order,
   11085                                     nameXMMReg(eregOfRM(modrm)),
   11086                                     nameXMMReg(gregOfRM(modrm)));
   11087       } else {
   11088          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11089          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11090 	 order = (Int)insn[3+alen];
   11091          delta += 4+alen;
   11092          DIP("pshuflw $%d,%s,%s\n", order,
   11093                                     dis_buf,
   11094                                     nameXMMReg(gregOfRM(modrm)));
   11095       }
   11096       assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
   11097       breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
   11098 
   11099 #     define SEL(n) \
   11100                 ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11101       assign(dVlo,
   11102 	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   11103                           SEL((order>>2)&3), SEL((order>>0)&3) )
   11104       );
   11105       assign(dV, binop( Iop_64HLtoV128,
   11106                         unop(Iop_V128HIto64, mkexpr(sV)),
   11107                         mkexpr(dVlo) ) );
   11108       putXMMReg(gregOfRM(modrm), mkexpr(dV));
   11109 #     undef SEL
   11110       goto decode_success;
   11111    }
   11112 
   11113    /* 66 0F 72 /6 ib = PSLLD by immediate */
   11114    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11115        && epartIsReg(insn[2])
   11116        && gregOfRM(insn[2]) == 6) {
   11117       delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
   11118       goto decode_success;
   11119    }
   11120 
   11121    /* 66 0F F2 = PSLLD by E */
   11122    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
   11123       delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
   11124       goto decode_success;
   11125    }
   11126 
   11127    /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   11128    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11129        && epartIsReg(insn[2])
   11130        && gregOfRM(insn[2]) == 7) {
   11131       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11132       Int    imm = (Int)insn[3];
   11133       Int    reg = eregOfRM(insn[2]);
   11134       DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   11135       vassert(imm >= 0 && imm <= 255);
   11136       delta += 4;
   11137 
   11138       sV    = newTemp(Ity_V128);
   11139       dV    = newTemp(Ity_V128);
   11140       hi64  = newTemp(Ity_I64);
   11141       lo64  = newTemp(Ity_I64);
   11142       hi64r = newTemp(Ity_I64);
   11143       lo64r = newTemp(Ity_I64);
   11144 
   11145       if (imm >= 16) {
   11146          putXMMReg(reg, mkV128(0x0000));
   11147          goto decode_success;
   11148       }
   11149 
   11150       assign( sV, getXMMReg(reg) );
   11151       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11152       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11153 
   11154       if (imm == 0) {
   11155          assign( lo64r, mkexpr(lo64) );
   11156          assign( hi64r, mkexpr(hi64) );
   11157       }
   11158       else
   11159       if (imm == 8) {
   11160          assign( lo64r, mkU64(0) );
   11161          assign( hi64r, mkexpr(lo64) );
   11162       }
   11163       else
   11164       if (imm > 8) {
   11165          assign( lo64r, mkU64(0) );
   11166          assign( hi64r, binop( Iop_Shl64,
   11167                                mkexpr(lo64),
   11168                                mkU8( 8*(imm-8) ) ));
   11169       } else {
   11170          assign( lo64r, binop( Iop_Shl64,
   11171                                mkexpr(lo64),
   11172                                mkU8(8 * imm) ));
   11173          assign( hi64r,
   11174                  binop( Iop_Or64,
   11175                         binop(Iop_Shl64, mkexpr(hi64),
   11176                                          mkU8(8 * imm)),
   11177                         binop(Iop_Shr64, mkexpr(lo64),
   11178                                          mkU8(8 * (8 - imm)) )
   11179                       )
   11180                );
   11181       }
   11182       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11183       putXMMReg(reg, mkexpr(dV));
   11184       goto decode_success;
   11185    }
   11186 
   11187    /* 66 0F 73 /6 ib = PSLLQ by immediate */
   11188    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11189        && epartIsReg(insn[2])
   11190        && gregOfRM(insn[2]) == 6) {
   11191       delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
   11192       goto decode_success;
   11193    }
   11194 
   11195    /* 66 0F F3 = PSLLQ by E */
   11196    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
   11197       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
   11198       goto decode_success;
   11199    }
   11200 
   11201    /* 66 0F 71 /6 ib = PSLLW by immediate */
   11202    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11203        && epartIsReg(insn[2])
   11204        && gregOfRM(insn[2]) == 6) {
   11205       delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
   11206       goto decode_success;
   11207    }
   11208 
   11209    /* 66 0F F1 = PSLLW by E */
   11210    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
   11211       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
   11212       goto decode_success;
   11213    }
   11214 
   11215    /* 66 0F 72 /4 ib = PSRAD by immediate */
   11216    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11217        && epartIsReg(insn[2])
   11218        && gregOfRM(insn[2]) == 4) {
   11219       delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
   11220       goto decode_success;
   11221    }
   11222 
   11223    /* 66 0F E2 = PSRAD by E */
   11224    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
   11225       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
   11226       goto decode_success;
   11227    }
   11228 
   11229    /* 66 0F 71 /4 ib = PSRAW by immediate */
   11230    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11231        && epartIsReg(insn[2])
   11232        && gregOfRM(insn[2]) == 4) {
   11233       delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
   11234       goto decode_success;
   11235    }
   11236 
   11237    /* 66 0F E1 = PSRAW by E */
   11238    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
   11239       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
   11240       goto decode_success;
   11241    }
   11242 
   11243    /* 66 0F 72 /2 ib = PSRLD by immediate */
   11244    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
   11245        && epartIsReg(insn[2])
   11246        && gregOfRM(insn[2]) == 2) {
   11247       delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
   11248       goto decode_success;
   11249    }
   11250 
   11251    /* 66 0F D2 = PSRLD by E */
   11252    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
   11253       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
   11254       goto decode_success;
   11255    }
   11256 
   11257    /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   11258    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11259        && epartIsReg(insn[2])
   11260        && gregOfRM(insn[2]) == 3) {
   11261       IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
   11262       Int    imm = (Int)insn[3];
   11263       Int    reg = eregOfRM(insn[2]);
   11264       DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   11265       vassert(imm >= 0 && imm <= 255);
   11266       delta += 4;
   11267 
   11268       sV    = newTemp(Ity_V128);
   11269       dV    = newTemp(Ity_V128);
   11270       hi64  = newTemp(Ity_I64);
   11271       lo64  = newTemp(Ity_I64);
   11272       hi64r = newTemp(Ity_I64);
   11273       lo64r = newTemp(Ity_I64);
   11274 
   11275       if (imm >= 16) {
   11276          putXMMReg(reg, mkV128(0x0000));
   11277          goto decode_success;
   11278       }
   11279 
   11280       assign( sV, getXMMReg(reg) );
   11281       assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   11282       assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   11283 
   11284       if (imm == 0) {
   11285          assign( lo64r, mkexpr(lo64) );
   11286          assign( hi64r, mkexpr(hi64) );
   11287       }
   11288       else
   11289       if (imm == 8) {
   11290          assign( hi64r, mkU64(0) );
   11291          assign( lo64r, mkexpr(hi64) );
   11292       }
   11293       else
   11294       if (imm > 8) {
   11295          assign( hi64r, mkU64(0) );
   11296          assign( lo64r, binop( Iop_Shr64,
   11297                                mkexpr(hi64),
   11298                                mkU8( 8*(imm-8) ) ));
   11299       } else {
   11300          assign( hi64r, binop( Iop_Shr64,
   11301                                mkexpr(hi64),
   11302                                mkU8(8 * imm) ));
   11303          assign( lo64r,
   11304                  binop( Iop_Or64,
   11305                         binop(Iop_Shr64, mkexpr(lo64),
   11306                                          mkU8(8 * imm)),
   11307                         binop(Iop_Shl64, mkexpr(hi64),
   11308                                          mkU8(8 * (8 - imm)) )
   11309                       )
   11310                );
   11311       }
   11312 
   11313       assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   11314       putXMMReg(reg, mkexpr(dV));
   11315       goto decode_success;
   11316    }
   11317 
   11318    /* 66 0F 73 /2 ib = PSRLQ by immediate */
   11319    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
   11320        && epartIsReg(insn[2])
   11321        && gregOfRM(insn[2]) == 2) {
   11322       delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
   11323       goto decode_success;
   11324    }
   11325 
   11326    /* 66 0F D3 = PSRLQ by E */
   11327    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
   11328       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
   11329       goto decode_success;
   11330    }
   11331 
   11332    /* 66 0F 71 /2 ib = PSRLW by immediate */
   11333    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
   11334        && epartIsReg(insn[2])
   11335        && gregOfRM(insn[2]) == 2) {
   11336       delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
   11337       goto decode_success;
   11338    }
   11339 
   11340    /* 66 0F D1 = PSRLW by E */
   11341    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
   11342       delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
   11343       goto decode_success;
   11344    }
   11345 
   11346    /* 66 0F F8 = PSUBB */
   11347    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
   11348       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11349                                  "psubb", Iop_Sub8x16, False );
   11350       goto decode_success;
   11351    }
   11352 
   11353    /* 66 0F FA = PSUBD */
   11354    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
   11355       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11356                                  "psubd", Iop_Sub32x4, False );
   11357       goto decode_success;
   11358    }
   11359 
   11360    /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   11361    /* 0F FB = PSUBQ -- sub 64x1 */
   11362    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11363       do_MMX_preamble();
   11364       delta = dis_MMXop_regmem_to_reg (
   11365                 sorb, delta+2, insn[1], "psubq", False );
   11366       goto decode_success;
   11367    }
   11368 
   11369    /* 66 0F FB = PSUBQ */
   11370    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
   11371       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11372                                  "psubq", Iop_Sub64x2, False );
   11373       goto decode_success;
   11374    }
   11375 
   11376    /* 66 0F F9 = PSUBW */
   11377    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
   11378       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11379                                  "psubw", Iop_Sub16x8, False );
   11380       goto decode_success;
   11381    }
   11382 
   11383    /* 66 0F E8 = PSUBSB */
   11384    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
   11385       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11386                                  "psubsb", Iop_QSub8Sx16, False );
   11387       goto decode_success;
   11388    }
   11389 
   11390    /* 66 0F E9 = PSUBSW */
   11391    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
   11392       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11393                                  "psubsw", Iop_QSub16Sx8, False );
   11394       goto decode_success;
   11395    }
   11396 
   11397    /* 66 0F D8 = PSUBUSB */
   11398    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
   11399       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11400                                  "psubusb", Iop_QSub8Ux16, False );
   11401       goto decode_success;
   11402    }
   11403 
   11404    /* 66 0F D9 = PSUBUSW */
   11405    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
   11406       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11407                                  "psubusw", Iop_QSub16Ux8, False );
   11408       goto decode_success;
   11409    }
   11410 
   11411    /* 66 0F 68 = PUNPCKHBW */
   11412    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
   11413       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11414                                  "punpckhbw",
   11415                                  Iop_InterleaveHI8x16, True );
   11416       goto decode_success;
   11417    }
   11418 
   11419    /* 66 0F 6A = PUNPCKHDQ */
   11420    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
   11421       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11422                                  "punpckhdq",
   11423                                  Iop_InterleaveHI32x4, True );
   11424       goto decode_success;
   11425    }
   11426 
   11427    /* 66 0F 6D = PUNPCKHQDQ */
   11428    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
   11429       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11430                                  "punpckhqdq",
   11431                                  Iop_InterleaveHI64x2, True );
   11432       goto decode_success;
   11433    }
   11434 
   11435    /* 66 0F 69 = PUNPCKHWD */
   11436    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
   11437       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11438                                  "punpckhwd",
   11439                                  Iop_InterleaveHI16x8, True );
   11440       goto decode_success;
   11441    }
   11442 
   11443    /* 66 0F 60 = PUNPCKLBW */
   11444    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
   11445       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11446                                  "punpcklbw",
   11447                                  Iop_InterleaveLO8x16, True );
   11448       goto decode_success;
   11449    }
   11450 
   11451    /* 66 0F 62 = PUNPCKLDQ */
   11452    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
   11453       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11454                                  "punpckldq",
   11455                                  Iop_InterleaveLO32x4, True );
   11456       goto decode_success;
   11457    }
   11458 
   11459    /* 66 0F 6C = PUNPCKLQDQ */
   11460    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
   11461       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11462                                  "punpcklqdq",
   11463                                  Iop_InterleaveLO64x2, True );
   11464       goto decode_success;
   11465    }
   11466 
   11467    /* 66 0F 61 = PUNPCKLWD */
   11468    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
   11469       delta = dis_SSEint_E_to_G( sorb, delta+2,
   11470                                  "punpcklwd",
   11471                                  Iop_InterleaveLO16x8, True );
   11472       goto decode_success;
   11473    }
   11474 
   11475    /* 66 0F EF = PXOR */
   11476    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
   11477       delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
   11478       goto decode_success;
   11479    }
   11480 
   11481 //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
   11482 //--    if (insn[0] == 0x0F && insn[1] == 0xAE
   11483 //--        && (!epartIsReg(insn[2]))
   11484 //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
   11485 //--       Bool store = gregOfRM(insn[2]) == 0;
   11486 //--       vg_assert(sz == 4);
   11487 //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
   11488 //--       t1   = LOW24(pair);
   11489 //--       eip += 2+HI8(pair);
   11490 //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
   11491 //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
   11492 //--                   Lit16, (UShort)insn[2],
   11493 //--                   TempReg, t1 );
   11494 //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
   11495 //--       goto decode_success;
   11496 //--    }
   11497 
   11498    /* 0F AE /7 = CLFLUSH -- flush cache line */
   11499    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
   11500        && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
   11501 
   11502       /* This is something of a hack.  We need to know the size of the
   11503          cache line containing addr.  Since we don't (easily), assume
   11504          256 on the basis that no real cache would have a line that
   11505          big.  It's safe to invalidate more stuff than we need, just
   11506          inefficient. */
   11507       UInt lineszB = 256;
   11508 
   11509       addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11510       delta += 2+alen;
   11511 
   11512       /* Round addr down to the start of the containing block. */
   11513       stmt( IRStmt_Put(
   11514                OFFB_TISTART,
   11515                binop( Iop_And32,
   11516                       mkexpr(addr),
   11517                       mkU32( ~(lineszB-1) ))) );
   11518 
   11519       stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
   11520 
   11521       irsb->jumpkind = Ijk_TInval;
   11522       irsb->next     = mkU32(guest_EIP_bbstart+delta);
   11523       dres.whatNext  = Dis_StopHere;
   11524 
   11525       DIP("clflush %s\n", dis_buf);
   11526       goto decode_success;
   11527    }
   11528 
   11529    /* ---------------------------------------------------- */
   11530    /* --- end of the SSE2 decoder.                     --- */
   11531    /* ---------------------------------------------------- */
   11532 
   11533    /* ---------------------------------------------------- */
   11534    /* --- start of the SSE3 decoder.                   --- */
   11535    /* ---------------------------------------------------- */
   11536 
   11537    /* Skip parts of the decoder which don't apply given the stated
   11538       guest subarchitecture. */
   11539    /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
   11540    /* In fact this is highly bogus; we accept SSE3 insns even on a
   11541       SSE2-only guest since they turn into IR which can be re-emitted
   11542       successfully on an SSE2 host. */
   11543    if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
   11544       goto after_sse_decoders; /* no SSE3 capabilities */
   11545 
   11546    insn = (UChar*)&guest_code[delta];
   11547 
   11548    /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   11549       duplicating some lanes (2:2:0:0). */
   11550    /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   11551       duplicating some lanes (3:3:1:1). */
   11552    if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
   11553        && (insn[2] == 0x12 || insn[2] == 0x16)) {
   11554       IRTemp s3, s2, s1, s0;
   11555       IRTemp sV  = newTemp(Ity_V128);
   11556       Bool   isH = insn[2] == 0x16;
   11557       s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11558 
   11559       modrm = insn[3];
   11560       if (epartIsReg(modrm)) {
   11561          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11562          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11563                                   nameXMMReg(eregOfRM(modrm)),
   11564                                   nameXMMReg(gregOfRM(modrm)));
   11565          delta += 3+1;
   11566       } else {
   11567          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11568          gen_SEGV_if_not_16_aligned( addr );
   11569          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11570          DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
   11571 	     dis_buf,
   11572              nameXMMReg(gregOfRM(modrm)));
   11573          delta += 3+alen;
   11574       }
   11575 
   11576       breakup128to32s( sV, &s3, &s2, &s1, &s0 );
   11577       putXMMReg( gregOfRM(modrm),
   11578                  isH ? mk128from32s( s3, s3, s1, s1 )
   11579                      : mk128from32s( s2, s2, s0, s0 ) );
   11580       goto decode_success;
   11581    }
   11582 
   11583    /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   11584       duplicating some lanes (0:1:0:1). */
   11585    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
   11586       IRTemp sV = newTemp(Ity_V128);
   11587       IRTemp d0 = newTemp(Ity_I64);
   11588 
   11589       modrm = insn[3];
   11590       if (epartIsReg(modrm)) {
   11591          assign( sV, getXMMReg( eregOfRM(modrm)) );
   11592          DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11593                                 nameXMMReg(gregOfRM(modrm)));
   11594          delta += 3+1;
   11595          assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   11596       } else {
   11597          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11598          assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   11599          DIP("movddup %s,%s\n", dis_buf,
   11600                                 nameXMMReg(gregOfRM(modrm)));
   11601          delta += 3+alen;
   11602       }
   11603 
   11604       putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   11605       goto decode_success;
   11606    }
   11607 
   11608    /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   11609    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
   11610       IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11611       IRTemp eV   = newTemp(Ity_V128);
   11612       IRTemp gV   = newTemp(Ity_V128);
   11613       IRTemp addV = newTemp(Ity_V128);
   11614       IRTemp subV = newTemp(Ity_V128);
   11615       a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11616 
   11617       modrm = insn[3];
   11618       if (epartIsReg(modrm)) {
   11619          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11620          DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11621                                  nameXMMReg(gregOfRM(modrm)));
   11622          delta += 3+1;
   11623       } else {
   11624          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11625          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11626          DIP("addsubps %s,%s\n", dis_buf,
   11627                                  nameXMMReg(gregOfRM(modrm)));
   11628          delta += 3+alen;
   11629       }
   11630 
   11631       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11632 
   11633       assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
   11634       assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
   11635 
   11636       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
   11637       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
   11638 
   11639       putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
   11640       goto decode_success;
   11641    }
   11642 
   11643    /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   11644    if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
   11645       IRTemp eV   = newTemp(Ity_V128);
   11646       IRTemp gV   = newTemp(Ity_V128);
   11647       IRTemp addV = newTemp(Ity_V128);
   11648       IRTemp subV = newTemp(Ity_V128);
   11649       IRTemp a1     = newTemp(Ity_I64);
   11650       IRTemp s0     = newTemp(Ity_I64);
   11651 
   11652       modrm = insn[2];
   11653       if (epartIsReg(modrm)) {
   11654          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11655          DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11656                                  nameXMMReg(gregOfRM(modrm)));
   11657          delta += 2+1;
   11658       } else {
   11659          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11660          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11661          DIP("addsubpd %s,%s\n", dis_buf,
   11662                                  nameXMMReg(gregOfRM(modrm)));
   11663          delta += 2+alen;
   11664       }
   11665 
   11666       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11667 
   11668       assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
   11669       assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
   11670 
   11671       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11672       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11673 
   11674       putXMMReg( gregOfRM(modrm),
   11675                  binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11676       goto decode_success;
   11677    }
   11678 
   11679    /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   11680    /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   11681    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
   11682        && (insn[2] == 0x7C || insn[2] == 0x7D)) {
   11683       IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
   11684       IRTemp eV     = newTemp(Ity_V128);
   11685       IRTemp gV     = newTemp(Ity_V128);
   11686       IRTemp leftV  = newTemp(Ity_V128);
   11687       IRTemp rightV = newTemp(Ity_V128);
   11688       Bool   isAdd  = insn[2] == 0x7C;
   11689       HChar* str    = isAdd ? "add" : "sub";
   11690       e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
   11691 
   11692       modrm = insn[3];
   11693       if (epartIsReg(modrm)) {
   11694          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11695          DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11696                                    nameXMMReg(gregOfRM(modrm)));
   11697          delta += 3+1;
   11698       } else {
   11699          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11700          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11701          DIP("h%sps %s,%s\n", str, dis_buf,
   11702                                    nameXMMReg(gregOfRM(modrm)));
   11703          delta += 3+alen;
   11704       }
   11705 
   11706       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11707 
   11708       breakup128to32s( eV, &e3, &e2, &e1, &e0 );
   11709       breakup128to32s( gV, &g3, &g2, &g1, &g0 );
   11710 
   11711       assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
   11712       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
   11713 
   11714       putXMMReg( gregOfRM(modrm),
   11715                  binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   11716                        mkexpr(leftV), mkexpr(rightV) ) );
   11717       goto decode_success;
   11718    }
   11719 
   11720    /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   11721    /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   11722    if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
   11723       IRTemp e1     = newTemp(Ity_I64);
   11724       IRTemp e0     = newTemp(Ity_I64);
   11725       IRTemp g1     = newTemp(Ity_I64);
   11726       IRTemp g0     = newTemp(Ity_I64);
   11727       IRTemp eV     = newTemp(Ity_V128);
   11728       IRTemp gV     = newTemp(Ity_V128);
   11729       IRTemp leftV  = newTemp(Ity_V128);
   11730       IRTemp rightV = newTemp(Ity_V128);
   11731       Bool   isAdd  = insn[1] == 0x7C;
   11732       HChar* str    = isAdd ? "add" : "sub";
   11733 
   11734       modrm = insn[2];
   11735       if (epartIsReg(modrm)) {
   11736          assign( eV, getXMMReg( eregOfRM(modrm)) );
   11737          DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   11738                                    nameXMMReg(gregOfRM(modrm)));
   11739          delta += 2+1;
   11740       } else {
   11741          addr = disAMode ( &alen, sorb, delta+2, dis_buf );
   11742          assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   11743          DIP("h%spd %s,%s\n", str, dis_buf,
   11744                               nameXMMReg(gregOfRM(modrm)));
   11745          delta += 2+alen;
   11746       }
   11747 
   11748       assign( gV, getXMMReg(gregOfRM(modrm)) );
   11749 
   11750       assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
   11751       assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
   11752       assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
   11753       assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
   11754 
   11755       assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
   11756       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
   11757 
   11758       putXMMReg( gregOfRM(modrm),
   11759                  binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   11760                        mkexpr(leftV), mkexpr(rightV) ) );
   11761       goto decode_success;
   11762    }
   11763 
   11764    /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
   11765    if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
   11766       modrm = getIByte(delta+3);
   11767       if (epartIsReg(modrm)) {
   11768          goto decode_failure;
   11769       } else {
   11770          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11771          putXMMReg( gregOfRM(modrm),
   11772                     loadLE(Ity_V128, mkexpr(addr)) );
   11773          DIP("lddqu %s,%s\n", dis_buf,
   11774                               nameXMMReg(gregOfRM(modrm)));
   11775          delta += 3+alen;
   11776       }
   11777       goto decode_success;
   11778    }
   11779 
   11780    /* ---------------------------------------------------- */
   11781    /* --- end of the SSE3 decoder.                     --- */
   11782    /* ---------------------------------------------------- */
   11783 
   11784    /* ---------------------------------------------------- */
   11785    /* --- start of the SSSE3 decoder.                  --- */
   11786    /* ---------------------------------------------------- */
   11787 
   11788    /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11789       Unsigned Bytes (MMX) */
   11790    if (sz == 4
   11791        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11792       IRTemp sV        = newTemp(Ity_I64);
   11793       IRTemp dV        = newTemp(Ity_I64);
   11794       IRTemp sVoddsSX  = newTemp(Ity_I64);
   11795       IRTemp sVevensSX = newTemp(Ity_I64);
   11796       IRTemp dVoddsZX  = newTemp(Ity_I64);
   11797       IRTemp dVevensZX = newTemp(Ity_I64);
   11798 
   11799       modrm = insn[3];
   11800       do_MMX_preamble();
   11801       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11802 
   11803       if (epartIsReg(modrm)) {
   11804          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11805          delta += 3+1;
   11806          DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   11807                                   nameMMXReg(gregOfRM(modrm)));
   11808       } else {
   11809          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11810          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11811          delta += 3+alen;
   11812          DIP("pmaddubsw %s,%s\n", dis_buf,
   11813                                   nameMMXReg(gregOfRM(modrm)));
   11814       }
   11815 
   11816       /* compute dV unsigned x sV signed */
   11817       assign( sVoddsSX,
   11818               binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   11819       assign( sVevensSX,
   11820               binop(Iop_SarN16x4,
   11821                     binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   11822                     mkU8(8)) );
   11823       assign( dVoddsZX,
   11824               binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   11825       assign( dVevensZX,
   11826               binop(Iop_ShrN16x4,
   11827                     binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   11828                     mkU8(8)) );
   11829 
   11830       putMMXReg(
   11831          gregOfRM(modrm),
   11832          binop(Iop_QAdd16Sx4,
   11833                binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11834                binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11835          )
   11836       );
   11837       goto decode_success;
   11838    }
   11839 
   11840    /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   11841       Unsigned Bytes (XMM) */
   11842    if (sz == 2
   11843        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
   11844       IRTemp sV        = newTemp(Ity_V128);
   11845       IRTemp dV        = newTemp(Ity_V128);
   11846       IRTemp sVoddsSX  = newTemp(Ity_V128);
   11847       IRTemp sVevensSX = newTemp(Ity_V128);
   11848       IRTemp dVoddsZX  = newTemp(Ity_V128);
   11849       IRTemp dVevensZX = newTemp(Ity_V128);
   11850 
   11851       modrm = insn[3];
   11852       assign( dV, getXMMReg(gregOfRM(modrm)) );
   11853 
   11854       if (epartIsReg(modrm)) {
   11855          assign( sV, getXMMReg(eregOfRM(modrm)) );
   11856          delta += 3+1;
   11857          DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   11858                                   nameXMMReg(gregOfRM(modrm)));
   11859       } else {
   11860          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11861          gen_SEGV_if_not_16_aligned( addr );
   11862          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11863          delta += 3+alen;
   11864          DIP("pmaddubsw %s,%s\n", dis_buf,
   11865                                   nameXMMReg(gregOfRM(modrm)));
   11866       }
   11867 
   11868       /* compute dV unsigned x sV signed */
   11869       assign( sVoddsSX,
   11870               binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   11871       assign( sVevensSX,
   11872               binop(Iop_SarN16x8,
   11873                     binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   11874                     mkU8(8)) );
   11875       assign( dVoddsZX,
   11876               binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   11877       assign( dVevensZX,
   11878               binop(Iop_ShrN16x8,
   11879                     binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   11880                     mkU8(8)) );
   11881 
   11882       putXMMReg(
   11883          gregOfRM(modrm),
   11884          binop(Iop_QAdd16Sx8,
   11885                binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   11886                binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   11887          )
   11888       );
   11889       goto decode_success;
   11890    }
   11891 
   11892    /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   11893    /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   11894       mmx) and G to G (mmx). */
   11895    /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   11896       mmx) and G to G (mmx). */
   11897    /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   11898       to G (mmx). */
   11899    /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   11900       to G (mmx). */
   11901    /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   11902       to G (mmx). */
   11903    /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   11904       to G (mmx). */
   11905 
   11906    if (sz == 4
   11907        && insn[0] == 0x0F && insn[1] == 0x38
   11908        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   11909            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   11910       HChar* str    = "???";
   11911       IROp   opV64  = Iop_INVALID;
   11912       IROp   opCatO = Iop_CatOddLanes16x4;
   11913       IROp   opCatE = Iop_CatEvenLanes16x4;
   11914       IRTemp sV     = newTemp(Ity_I64);
   11915       IRTemp dV     = newTemp(Ity_I64);
   11916 
   11917       modrm = insn[3];
   11918 
   11919       switch (insn[2]) {
   11920          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   11921          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   11922          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   11923          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   11924          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   11925          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   11926          default: vassert(0);
   11927       }
   11928       if (insn[2] == 0x02 || insn[2] == 0x06) {
   11929          opCatO = Iop_InterleaveHI32x2;
   11930          opCatE = Iop_InterleaveLO32x2;
   11931       }
   11932 
   11933       do_MMX_preamble();
   11934       assign( dV, getMMXReg(gregOfRM(modrm)) );
   11935 
   11936       if (epartIsReg(modrm)) {
   11937          assign( sV, getMMXReg(eregOfRM(modrm)) );
   11938          delta += 3+1;
   11939          DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   11940                                   nameMMXReg(gregOfRM(modrm)));
   11941       } else {
   11942          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   11943          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   11944          delta += 3+alen;
   11945          DIP("ph%s %s,%s\n", str, dis_buf,
   11946                                   nameMMXReg(gregOfRM(modrm)));
   11947       }
   11948 
   11949       putMMXReg(
   11950          gregOfRM(modrm),
   11951          binop(opV64,
   11952                binop(opCatE,mkexpr(sV),mkexpr(dV)),
   11953                binop(opCatO,mkexpr(sV),mkexpr(dV))
   11954          )
   11955       );
   11956       goto decode_success;
   11957    }
   11958 
   11959    /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   11960       xmm) and G to G (xmm). */
   11961    /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   11962       xmm) and G to G (xmm). */
   11963    /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   11964       G to G (xmm). */
   11965    /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   11966       G to G (xmm). */
   11967    /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   11968       G to G (xmm). */
   11969    /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   11970       G to G (xmm). */
   11971 
   11972    if (sz == 2
   11973        && insn[0] == 0x0F && insn[1] == 0x38
   11974        && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
   11975            || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
   11976       HChar* str    = "???";
   11977       IROp   opV64  = Iop_INVALID;
   11978       IROp   opCatO = Iop_CatOddLanes16x4;
   11979       IROp   opCatE = Iop_CatEvenLanes16x4;
   11980       IRTemp sV     = newTemp(Ity_V128);
   11981       IRTemp dV     = newTemp(Ity_V128);
   11982       IRTemp sHi    = newTemp(Ity_I64);
   11983       IRTemp sLo    = newTemp(Ity_I64);
   11984       IRTemp dHi    = newTemp(Ity_I64);
   11985       IRTemp dLo    = newTemp(Ity_I64);
   11986 
   11987       modrm = insn[3];
   11988 
   11989       switch (insn[2]) {
   11990          case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   11991          case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   11992          case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   11993          case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   11994          case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   11995          case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   11996          default: vassert(0);
   11997       }
   11998       if (insn[2] == 0x02 || insn[2] == 0x06) {
   11999          opCatO = Iop_InterleaveHI32x2;
   12000          opCatE = Iop_InterleaveLO32x2;
   12001       }
   12002 
   12003       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12004 
   12005       if (epartIsReg(modrm)) {
   12006          assign( sV, getXMMReg( eregOfRM(modrm)) );
   12007          DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12008                                   nameXMMReg(gregOfRM(modrm)));
   12009          delta += 3+1;
   12010       } else {
   12011          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12012          gen_SEGV_if_not_16_aligned( addr );
   12013          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12014          DIP("ph%s %s,%s\n", str, dis_buf,
   12015                              nameXMMReg(gregOfRM(modrm)));
   12016          delta += 3+alen;
   12017       }
   12018 
   12019       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12020       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12021       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12022       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12023 
   12024       /* This isn't a particularly efficient way to compute the
   12025          result, but at least it avoids a proliferation of IROps,
   12026          hence avoids complicating all the backends. */
   12027       putXMMReg(
   12028          gregOfRM(modrm),
   12029          binop(Iop_64HLtoV128,
   12030                binop(opV64,
   12031                      binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   12032                      binop(opCatO,mkexpr(sHi),mkexpr(sLo))
   12033                ),
   12034                binop(opV64,
   12035                      binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   12036                      binop(opCatO,mkexpr(dHi),mkexpr(dLo))
   12037                )
   12038          )
   12039       );
   12040       goto decode_success;
   12041    }
   12042 
   12043    /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   12044       (MMX) */
   12045    if (sz == 4
   12046        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12047       IRTemp sV = newTemp(Ity_I64);
   12048       IRTemp dV = newTemp(Ity_I64);
   12049 
   12050       modrm = insn[3];
   12051       do_MMX_preamble();
   12052       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12053 
   12054       if (epartIsReg(modrm)) {
   12055          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12056          delta += 3+1;
   12057          DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12058                                  nameMMXReg(gregOfRM(modrm)));
   12059       } else {
   12060          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12061          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12062          delta += 3+alen;
   12063          DIP("pmulhrsw %s,%s\n", dis_buf,
   12064                                  nameMMXReg(gregOfRM(modrm)));
   12065       }
   12066 
   12067       putMMXReg(
   12068          gregOfRM(modrm),
   12069          dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   12070       );
   12071       goto decode_success;
   12072    }
   12073 
   12074    /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   12075       Scale (XMM) */
   12076    if (sz == 2
   12077        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
   12078       IRTemp sV  = newTemp(Ity_V128);
   12079       IRTemp dV  = newTemp(Ity_V128);
   12080       IRTemp sHi = newTemp(Ity_I64);
   12081       IRTemp sLo = newTemp(Ity_I64);
   12082       IRTemp dHi = newTemp(Ity_I64);
   12083       IRTemp dLo = newTemp(Ity_I64);
   12084 
   12085       modrm = insn[3];
   12086       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12087 
   12088       if (epartIsReg(modrm)) {
   12089          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12090          delta += 3+1;
   12091          DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12092                                  nameXMMReg(gregOfRM(modrm)));
   12093       } else {
   12094          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12095          gen_SEGV_if_not_16_aligned( addr );
   12096          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12097          delta += 3+alen;
   12098          DIP("pmulhrsw %s,%s\n", dis_buf,
   12099                                  nameXMMReg(gregOfRM(modrm)));
   12100       }
   12101 
   12102       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12103       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12104       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12105       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12106 
   12107       putXMMReg(
   12108          gregOfRM(modrm),
   12109          binop(Iop_64HLtoV128,
   12110                dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   12111                dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   12112          )
   12113       );
   12114       goto decode_success;
   12115    }
   12116 
   12117    /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   12118    /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   12119    /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   12120    if (sz == 4
   12121        && insn[0] == 0x0F && insn[1] == 0x38
   12122        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12123       IRTemp sV      = newTemp(Ity_I64);
   12124       IRTemp dV      = newTemp(Ity_I64);
   12125       HChar* str     = "???";
   12126       Int    laneszB = 0;
   12127 
   12128       switch (insn[2]) {
   12129          case 0x08: laneszB = 1; str = "b"; break;
   12130          case 0x09: laneszB = 2; str = "w"; break;
   12131          case 0x0A: laneszB = 4; str = "d"; break;
   12132          default: vassert(0);
   12133       }
   12134 
   12135       modrm = insn[3];
   12136       do_MMX_preamble();
   12137       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12138 
   12139       if (epartIsReg(modrm)) {
   12140          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12141          delta += 3+1;
   12142          DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12143                                      nameMMXReg(gregOfRM(modrm)));
   12144       } else {
   12145          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12146          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12147          delta += 3+alen;
   12148          DIP("psign%s %s,%s\n", str, dis_buf,
   12149                                      nameMMXReg(gregOfRM(modrm)));
   12150       }
   12151 
   12152       putMMXReg(
   12153          gregOfRM(modrm),
   12154          dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   12155       );
   12156       goto decode_success;
   12157    }
   12158 
   12159    /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   12160    /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   12161    /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   12162    if (sz == 2
   12163        && insn[0] == 0x0F && insn[1] == 0x38
   12164        && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
   12165       IRTemp sV      = newTemp(Ity_V128);
   12166       IRTemp dV      = newTemp(Ity_V128);
   12167       IRTemp sHi     = newTemp(Ity_I64);
   12168       IRTemp sLo     = newTemp(Ity_I64);
   12169       IRTemp dHi     = newTemp(Ity_I64);
   12170       IRTemp dLo     = newTemp(Ity_I64);
   12171       HChar* str     = "???";
   12172       Int    laneszB = 0;
   12173 
   12174       switch (insn[2]) {
   12175          case 0x08: laneszB = 1; str = "b"; break;
   12176          case 0x09: laneszB = 2; str = "w"; break;
   12177          case 0x0A: laneszB = 4; str = "d"; break;
   12178          default: vassert(0);
   12179       }
   12180 
   12181       modrm = insn[3];
   12182       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12183 
   12184       if (epartIsReg(modrm)) {
   12185          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12186          delta += 3+1;
   12187          DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12188                                      nameXMMReg(gregOfRM(modrm)));
   12189       } else {
   12190          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12191          gen_SEGV_if_not_16_aligned( addr );
   12192          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12193          delta += 3+alen;
   12194          DIP("psign%s %s,%s\n", str, dis_buf,
   12195                                      nameXMMReg(gregOfRM(modrm)));
   12196       }
   12197 
   12198       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12199       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12200       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12201       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12202 
   12203       putXMMReg(
   12204          gregOfRM(modrm),
   12205          binop(Iop_64HLtoV128,
   12206                dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   12207                dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   12208          )
   12209       );
   12210       goto decode_success;
   12211    }
   12212 
   12213    /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   12214    /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   12215    /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   12216    if (sz == 4
   12217        && insn[0] == 0x0F && insn[1] == 0x38
   12218        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12219       IRTemp sV      = newTemp(Ity_I64);
   12220       HChar* str     = "???";
   12221       Int    laneszB = 0;
   12222 
   12223       switch (insn[2]) {
   12224          case 0x1C: laneszB = 1; str = "b"; break;
   12225          case 0x1D: laneszB = 2; str = "w"; break;
   12226          case 0x1E: laneszB = 4; str = "d"; break;
   12227          default: vassert(0);
   12228       }
   12229 
   12230       modrm = insn[3];
   12231       do_MMX_preamble();
   12232 
   12233       if (epartIsReg(modrm)) {
   12234          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12235          delta += 3+1;
   12236          DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
   12237                                     nameMMXReg(gregOfRM(modrm)));
   12238       } else {
   12239          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12240          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12241          delta += 3+alen;
   12242          DIP("pabs%s %s,%s\n", str, dis_buf,
   12243                                     nameMMXReg(gregOfRM(modrm)));
   12244       }
   12245 
   12246       putMMXReg(
   12247          gregOfRM(modrm),
   12248          dis_PABS_helper( mkexpr(sV), laneszB )
   12249       );
   12250       goto decode_success;
   12251    }
   12252 
   12253    /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   12254    /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   12255    /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   12256    if (sz == 2
   12257        && insn[0] == 0x0F && insn[1] == 0x38
   12258        && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
   12259       IRTemp sV      = newTemp(Ity_V128);
   12260       IRTemp sHi     = newTemp(Ity_I64);
   12261       IRTemp sLo     = newTemp(Ity_I64);
   12262       HChar* str     = "???";
   12263       Int    laneszB = 0;
   12264 
   12265       switch (insn[2]) {
   12266          case 0x1C: laneszB = 1; str = "b"; break;
   12267          case 0x1D: laneszB = 2; str = "w"; break;
   12268          case 0x1E: laneszB = 4; str = "d"; break;
   12269          default: vassert(0);
   12270       }
   12271 
   12272       modrm = insn[3];
   12273 
   12274       if (epartIsReg(modrm)) {
   12275          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12276          delta += 3+1;
   12277          DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
   12278                                     nameXMMReg(gregOfRM(modrm)));
   12279       } else {
   12280          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12281          gen_SEGV_if_not_16_aligned( addr );
   12282          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12283          delta += 3+alen;
   12284          DIP("pabs%s %s,%s\n", str, dis_buf,
   12285                                     nameXMMReg(gregOfRM(modrm)));
   12286       }
   12287 
   12288       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12289       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12290 
   12291       putXMMReg(
   12292          gregOfRM(modrm),
   12293          binop(Iop_64HLtoV128,
   12294                dis_PABS_helper( mkexpr(sHi), laneszB ),
   12295                dis_PABS_helper( mkexpr(sLo), laneszB )
   12296          )
   12297       );
   12298       goto decode_success;
   12299    }
   12300 
   12301    /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   12302    if (sz == 4
   12303        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12304       IRTemp sV  = newTemp(Ity_I64);   /* source (E) operand */
   12305       IRTemp dV  = newTemp(Ity_I64);   /* old value of destination (G) */
   12306       IRTemp res = newTemp(Ity_I64);   /* value written back to G */
   12307 
   12308       modrm = insn[3];
   12309       do_MMX_preamble();
   12310       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12311 
   12312       if (epartIsReg(modrm)) {
   12313          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12314          d32 = (UInt)insn[3+1];   /* immediate follows the modrm byte */
   12315          delta += 3+1+1;
   12316          DIP("palignr $%d,%s,%s\n",  (Int)d32,
   12317                                      nameMMXReg(eregOfRM(modrm)),
   12318                                      nameMMXReg(gregOfRM(modrm)));
   12319       } else {
   12320          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12321          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12322          d32 = (UInt)insn[3+alen];   /* immediate follows the amode */
   12323          delta += 3+alen+1;
   12324          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12325                                     dis_buf,
   12326                                     nameMMXReg(gregOfRM(modrm)));
   12327       }
   12328       /* res = low 64 bits of (dV:sV) shifted right by d32 bytes */
   12329       if (d32 == 0) {
   12330          assign( res, mkexpr(sV) );
   12331       }
   12332       else if (d32 >= 1 && d32 <= 7) {
   12333          assign(res,
   12334                 binop(Iop_Or64,
   12335                       binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
   12336                       binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
   12337                      )));
   12338       }
   12339       else if (d32 == 8) {
   12340          assign( res, mkexpr(dV) );
   12341       }
   12342       else if (d32 >= 9 && d32 <= 15) {
   12343          assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
   12344       }
   12345       else if (d32 >= 16 && d32 <= 255) {
   12346          assign( res, mkU64(0) );
   12347       }
   12348       else   /* the chain above already covers 0 .. 255 */
   12349          vassert(0);
   12350 
   12351       putMMXReg( gregOfRM(modrm), mkexpr(res) );
   12352       goto decode_success;
   12353    }
   12354 
   12355    /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   12356    if (sz == 2
   12357        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
   12358       IRTemp sV  = newTemp(Ity_V128);
   12359       IRTemp dV  = newTemp(Ity_V128);
   12360       IRTemp sHi = newTemp(Ity_I64);
   12361       IRTemp sLo = newTemp(Ity_I64);
   12362       IRTemp dHi = newTemp(Ity_I64);
   12363       IRTemp dLo = newTemp(Ity_I64);
   12364       IRTemp rHi = newTemp(Ity_I64);
   12365       IRTemp rLo = newTemp(Ity_I64);
   12366 
   12367       modrm = insn[3];
   12368       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12369 
   12370       if (epartIsReg(modrm)) {
   12371          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12372          d32 = (UInt)insn[3+1];
   12373          delta += 3+1+1;
   12374          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12375                                     nameXMMReg(eregOfRM(modrm)),
   12376                                     nameXMMReg(gregOfRM(modrm)));
   12377       } else {
   12378          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12379          gen_SEGV_if_not_16_aligned( addr );
   12380          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12381          d32 = (UInt)insn[3+alen];
   12382          delta += 3+alen+1;
   12383          DIP("palignr $%d,%s,%s\n", (Int)d32,
   12384                                     dis_buf,
   12385                                     nameXMMReg(gregOfRM(modrm)));
   12386       }
   12387 
   12388       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12389       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12390       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12391       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12392 
   12393       if (d32 == 0) {
   12394          assign( rHi, mkexpr(sHi) );
   12395          assign( rLo, mkexpr(sLo) );
   12396       }
   12397       else if (d32 >= 1 && d32 <= 7) {
   12398          assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
   12399          assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
   12400       }
   12401       else if (d32 == 8) {
   12402          assign( rHi, mkexpr(dLo) );
   12403          assign( rLo, mkexpr(sHi) );
   12404       }
   12405       else if (d32 >= 9 && d32 <= 15) {
   12406          assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
   12407          assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
   12408       }
   12409       else if (d32 == 16) {
   12410          assign( rHi, mkexpr(dHi) );
   12411          assign( rLo, mkexpr(dLo) );
   12412       }
   12413       else if (d32 >= 17 && d32 <= 23) {
   12414          assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
   12415          assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
   12416       }
   12417       else if (d32 == 24) {
   12418          assign( rHi, mkU64(0) );
   12419          assign( rLo, mkexpr(dHi) );
   12420       }
   12421       else if (d32 >= 25 && d32 <= 31) {
   12422          assign( rHi, mkU64(0) );
   12423          assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
   12424       }
   12425       else if (d32 >= 32 && d32 <= 255) {
   12426          assign( rHi, mkU64(0) );
   12427          assign( rLo, mkU64(0) );
   12428       }
   12429       else
   12430          vassert(0);
   12431 
   12432       putXMMReg(
   12433          gregOfRM(modrm),
   12434          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12435       );
   12436       goto decode_success;
   12437    }
   12438 
   12439    /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   12440    if (sz == 4
   12441        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12442       IRTemp sV      = newTemp(Ity_I64);
   12443       IRTemp dV      = newTemp(Ity_I64);
   12444 
   12445       modrm = insn[3];
   12446       do_MMX_preamble();
   12447       assign( dV, getMMXReg(gregOfRM(modrm)) );
   12448 
   12449       if (epartIsReg(modrm)) {
   12450          assign( sV, getMMXReg(eregOfRM(modrm)) );
   12451          delta += 3+1;
   12452          DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
   12453                                nameMMXReg(gregOfRM(modrm)));
   12454       } else {
   12455          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12456          assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   12457          delta += 3+alen;
   12458          DIP("pshufb %s,%s\n", dis_buf,
   12459                                nameMMXReg(gregOfRM(modrm)));
   12460       }
   12461 
   12462       putMMXReg(
   12463          gregOfRM(modrm),
   12464          binop(
   12465             Iop_And64,
   12466             /* permute the lanes */
   12467             binop(
   12468                Iop_Perm8x8,
   12469                mkexpr(dV),
   12470                binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   12471             ),
   12472             /* mask off lanes which have (index & 0x80) == 0x80 */
   12473             unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   12474          )
   12475       );
   12476       goto decode_success;
   12477    }
   12478 
   12479    /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   12480    if (sz == 2
   12481        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
   12482       IRTemp sV         = newTemp(Ity_V128);
   12483       IRTemp dV         = newTemp(Ity_V128);
   12484       IRTemp sHi        = newTemp(Ity_I64);
   12485       IRTemp sLo        = newTemp(Ity_I64);
   12486       IRTemp dHi        = newTemp(Ity_I64);
   12487       IRTemp dLo        = newTemp(Ity_I64);
   12488       IRTemp rHi        = newTemp(Ity_I64);
   12489       IRTemp rLo        = newTemp(Ity_I64);
   12490       IRTemp sevens     = newTemp(Ity_I64);
   12491       IRTemp mask0x80hi = newTemp(Ity_I64);
   12492       IRTemp mask0x80lo = newTemp(Ity_I64);
   12493       IRTemp maskBit3hi = newTemp(Ity_I64);
   12494       IRTemp maskBit3lo = newTemp(Ity_I64);
   12495       IRTemp sAnd7hi    = newTemp(Ity_I64);
   12496       IRTemp sAnd7lo    = newTemp(Ity_I64);
   12497       IRTemp permdHi    = newTemp(Ity_I64);
   12498       IRTemp permdLo    = newTemp(Ity_I64);
   12499 
   12500       modrm = insn[3];
   12501       assign( dV, getXMMReg(gregOfRM(modrm)) );
   12502 
   12503       if (epartIsReg(modrm)) {
   12504          assign( sV, getXMMReg(eregOfRM(modrm)) );
   12505          delta += 3+1;
   12506          DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
   12507                                nameXMMReg(gregOfRM(modrm)));
   12508       } else {
   12509          addr = disAMode ( &alen, sorb, delta+3, dis_buf );
   12510          gen_SEGV_if_not_16_aligned( addr );
   12511          assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12512          delta += 3+alen;
   12513          DIP("pshufb %s,%s\n", dis_buf,
   12514                                nameXMMReg(gregOfRM(modrm)));
   12515       }
   12516 
   12517       assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   12518       assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   12519       assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   12520       assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   12521 
   12522       assign( sevens, mkU64(0x0707070707070707ULL) );
   12523 
   12524       /*
   12525       mask0x80hi = Not(SarN8x8(sHi,7))
   12526       maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
   12527       sAnd7hi    = And(sHi,sevens)
   12528       permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
   12529                        And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
   12530       rHi        = And(permdHi,mask0x80hi)
   12531       */
   12532       assign(
   12533          mask0x80hi,
   12534          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
   12535 
   12536       assign(
   12537          maskBit3hi,
   12538          binop(Iop_SarN8x8,
   12539                binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
   12540                mkU8(7)));
   12541 
   12542       assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
   12543 
   12544       assign(
   12545          permdHi,
   12546          binop(
   12547             Iop_Or64,
   12548             binop(Iop_And64,
   12549                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
   12550                   mkexpr(maskBit3hi)),
   12551             binop(Iop_And64,
   12552                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
   12553                   unop(Iop_Not64,mkexpr(maskBit3hi))) ));
   12554 
   12555       assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
   12556 
   12557       /* And the same for the lower half of the result.  What fun. */
   12558 
   12559       assign(
   12560          mask0x80lo,
   12561          unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
   12562 
   12563       assign(
   12564          maskBit3lo,
   12565          binop(Iop_SarN8x8,
   12566                binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
   12567                mkU8(7)));
   12568 
   12569       assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
   12570 
   12571       assign(
   12572          permdLo,
   12573          binop(
   12574             Iop_Or64,
   12575             binop(Iop_And64,
   12576                   binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
   12577                   mkexpr(maskBit3lo)),
   12578             binop(Iop_And64,
   12579                   binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
   12580                   unop(Iop_Not64,mkexpr(maskBit3lo))) ));
   12581 
   12582       assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
   12583 
   12584       putXMMReg(
   12585          gregOfRM(modrm),
   12586          binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
   12587       );
   12588       goto decode_success;
   12589    }
   12590 
   12591    /* ---------------------------------------------------- */
   12592    /* --- end of the SSSE3 decoder.                    --- */
   12593    /* ---------------------------------------------------- */
   12594 
   12595    /* ---------------------------------------------------- */
   12596    /* --- start of the SSE4 decoder                    --- */
   12597    /* ---------------------------------------------------- */
   12598 
   12599    /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   12600       (Partial implementation only -- only deal with cases where
   12601       the rounding mode is specified directly by the immediate byte.)
   12602       66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   12603       (Limitations ditto)
   12604    */
   12605    if (sz == 2
   12606        && insn[0] == 0x0F && insn[1] == 0x3A
   12607        && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
   12608       /* With the 0x0B test disabled above, isD is always False here. */
   12609       Bool   isD = insn[2] == 0x0B;
   12610       IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   12611       IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   12612       Int    imm = 0;
   12613 
   12614       modrm = insn[3];
   12615 
   12616       if (epartIsReg(modrm)) {
   12617          assign( src,
   12618                  isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
   12619                      : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
   12620          imm = insn[3+1];
   12621          if (imm & ~3) goto decode_failure;   /* only bits 1:0 handled */
   12622          delta += 3+1+1;
   12623          DIP( "rounds%c $%d,%s,%s\n",
   12624               isD ? 'd' : 's',
   12625               imm, nameXMMReg( eregOfRM(modrm) ),
   12626                    nameXMMReg( gregOfRM(modrm) ) );
   12627       } else {
   12628          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12629          assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   12630          imm = insn[3+alen];
   12631          if (imm & ~3) goto decode_failure;   /* only bits 1:0 handled */
   12632          delta += 3+alen+1;
   12633          DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
   12634               imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
   12635       }
   12636 
   12637       /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   12638          that encoding is the same as the encoding for IRRoundingMode,
   12639          we can use that value directly in the IR as a rounding
   12640          mode. */
   12641       assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   12642                   mkU32(imm & 3), mkexpr(src)) );
   12643 
   12644       if (isD)   /* only lane 0 of the destination is written */
   12645          putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
   12646       else
   12647          putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
   12648 
   12649       goto decode_success;
   12650    }
   12651 
   12652    /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
   12653       which we can only decode if we're sure this is an AMD cpu that
   12654       supports LZCNT, since otherwise it's BSR, which behaves
   12655       differently. */
   12656    if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
   12657        && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
   12658       vassert(sz == 2 || sz == 4);
   12659       /*IRType*/ ty  = szToITy(sz);
   12660       IRTemp     src = newTemp(ty);
   12661       modrm = insn[3];
   12662       if (epartIsReg(modrm)) {
   12663          assign(src, getIReg(sz, eregOfRM(modrm)));
   12664          delta += 3+1;
   12665          DIP("lzcnt%c %s, %s\n", nameISize(sz),
   12666              nameIReg(sz, eregOfRM(modrm)),
   12667              nameIReg(sz, gregOfRM(modrm)));
   12668       } else {
   12669          addr = disAMode( &alen, sorb, delta+3, dis_buf );
   12670          assign(src, loadLE(ty, mkexpr(addr)));
   12671          delta += 3+alen;
   12672          DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
   12673              nameIReg(sz, gregOfRM(modrm)));
   12674       }
   12675 
   12676       IRTemp res = gen_LZCNT(ty, src);
   12677       putIReg(sz, gregOfRM(modrm), mkexpr(res));
   12678 
   12679       // Update flags.  This is pretty lame .. perhaps can do better
   12680       // if this turns out to be performance critical.
   12681       // O S A P are cleared.  Z is set if RESULT == 0.
   12682       // C is set if SRC is zero.
   12683       IRTemp src32 = newTemp(Ity_I32);
   12684       IRTemp res32 = newTemp(Ity_I32);
   12685       assign(src32, widenUto32(mkexpr(src)));
   12686       assign(res32, widenUto32(mkexpr(res)));
   12687 
   12688       IRTemp oszacp = newTemp(Ity_I32);
   12689       assign(
   12690          oszacp,
   12691          binop(Iop_Or32,
   12692                binop(Iop_Shl32,
   12693                      unop(Iop_1Uto32,
   12694                           binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
   12695                      mkU8(X86G_CC_SHIFT_Z)),
   12696                binop(Iop_Shl32,
   12697                      unop(Iop_1Uto32,
   12698                           binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
   12699                      mkU8(X86G_CC_SHIFT_C))
   12700          )
   12701       );
   12702 
   12703       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12704       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12705       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12706       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
   12707 
   12708       goto decode_success;
   12709    }
   12710 
   12711    /* ---------------------------------------------------- */
   12712    /* --- end of the SSE4 decoder                      --- */
   12713    /* ---------------------------------------------------- */
   12714 
   12715    after_sse_decoders:
   12716 
   12717    /* ---------------------------------------------------- */
   12718    /* --- deal with misc 0x67 pfxs (addr size override) -- */
   12719    /* ---------------------------------------------------- */
   12720 
   12721    /* 67 E3 = JCXZ (for JECXZ see below) */
   12722    if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
   12723       delta += 2;
   12724       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   12725       delta ++;
   12726       stmt( IRStmt_Exit(
   12727                binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
   12728                Ijk_Boring,
   12729                IRConst_U32(d32)
   12730             ));
   12731        DIP("jcxz 0x%x\n", d32);
   12732        goto decode_success;
   12733    }
   12734 
   12735    /* ---------------------------------------------------- */
   12736    /* --- start of the baseline insn decoder            -- */
   12737    /* ---------------------------------------------------- */
   12738 
   12739    /* Get the primary opcode. */
   12740    opc = getIByte(delta); delta++;
   12741 
   12742    /* We get here if the current insn isn't SSE, or this CPU doesn't
   12743       support SSE. */
   12744 
   12745    switch (opc) {
   12746 
   12747    /* ------------------------ Control flow --------------- */
   12748 
   12749    case 0xC2: /* RET imm16 */
   12750       d32 = getUDisp16(delta);
   12751       delta += 2;
   12752       dis_ret(d32);
   12753       dres.whatNext = Dis_StopHere;
   12754       DIP("ret %d\n", (Int)d32);
   12755       break;
   12756    case 0xC3: /* RET */
   12757       dis_ret(0);
   12758       dres.whatNext = Dis_StopHere;
   12759       DIP("ret\n");
   12760       break;
   12761 
   12762    case 0xCF: /* IRET */
   12763       /* Note, this is an extremely kludgey and limited implementation
   12764          of iret.  All it really does is:
   12765             popl %EIP; popl %CS; popl %EFLAGS.
   12766          %CS is set but ignored (as it is in (eg) popw %cs). */
   12767       t1 = newTemp(Ity_I32); /* ESP */
   12768       t2 = newTemp(Ity_I32); /* new EIP */
   12769       t3 = newTemp(Ity_I32); /* new CS */
   12770       t4 = newTemp(Ity_I32); /* new EFLAGS */
   12771       assign(t1, getIReg(4,R_ESP));
   12772       assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
   12773       assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
   12774       assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
   12775       /* Get stuff off stack */
   12776       putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
   12777       /* set %CS (which is ignored anyway) */
   12778       putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
   12779       /* set %EFLAGS */
   12780       set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
   12781       /* goto new EIP value */
   12782       jmp_treg(Ijk_Ret,t2);
   12783       dres.whatNext = Dis_StopHere;
   12784       DIP("iret (very kludgey)\n");
   12785       break;
   12786 
   12787    case 0xE8: /* CALL J4 */
   12788       d32 = getUDisp32(delta); delta += 4;
   12789       d32 += (guest_EIP_bbstart+delta);
   12790       /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
   12791       if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
   12792                                          && getIByte(delta) <= 0x5F) {
   12793          /* Specially treat the position-independent-code idiom
   12794                  call X
   12795               X: popl %reg
   12796             as
   12797                  movl %eip, %reg.
   12798             since this generates better code, but for no other reason. */
   12799          Int archReg = getIByte(delta) - 0x58;
   12800          /* vex_printf("-- fPIC thingy\n"); */
   12801          putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
   12802          delta++; /* Step over the POP */
   12803          DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
   12804       } else {
   12805          /* The normal sequence for a call. */
   12806          t1 = newTemp(Ity_I32);
   12807          assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
   12808          putIReg(4, R_ESP, mkexpr(t1));
   12809          storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
   12810          if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
   12811             /* follow into the call target. */
   12812             dres.whatNext   = Dis_ResteerU;
   12813             dres.continueAt = (Addr64)(Addr32)d32;
   12814          } else {
   12815             jmp_lit(Ijk_Call,d32);
   12816             dres.whatNext = Dis_StopHere;
   12817          }
   12818          DIP("call 0x%x\n",d32);
   12819       }
   12820       break;
   12821 
   12822 //--    case 0xC8: /* ENTER */
   12823 //--       d32 = getUDisp16(eip); eip += 2;
   12824 //--       abyte = getIByte(delta); delta++;
   12825 //--
   12826 //--       vg_assert(sz == 4);
   12827 //--       vg_assert(abyte == 0);
   12828 //--
   12829 //--       t1 = newTemp(cb); t2 = newTemp(cb);
   12830 //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
   12831 //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
   12832 //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12833 //--       uLiteral(cb, sz);
   12834 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12835 //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
   12836 //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
   12837 //--       if (d32) {
   12838 //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
   12839 //--          uLiteral(cb, d32);
   12840 //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
   12841 //--       }
   12842 //--       DIP("enter 0x%x, 0x%x", d32, abyte);
   12843 //--       break;
   12844 
   12845    case 0xC9: /* LEAVE */
   12846       vassert(sz == 4);
   12847       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   12848       assign(t1, getIReg(4,R_EBP));
   12849       /* First PUT ESP looks redundant, but need it because ESP must
   12850          always be up-to-date for Memcheck to work... */
   12851       putIReg(4, R_ESP, mkexpr(t1));
   12852       assign(t2, loadLE(Ity_I32,mkexpr(t1)));
   12853       putIReg(4, R_EBP, mkexpr(t2));
   12854       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
   12855       DIP("leave\n");
   12856       break;
   12857 
   12858    /* ---------------- Misc weird-ass insns --------------- */
   12859 
   12860    case 0x27: /* DAA */
   12861    case 0x2F: /* DAS */
   12862    case 0x37: /* AAA */
   12863    case 0x3F: /* AAS */
   12864       /* An ugly implementation for some ugly instructions.  Oh
   12865 	 well. */
   12866       if (sz != 4) goto decode_failure;
   12867       t1 = newTemp(Ity_I32);
   12868       t2 = newTemp(Ity_I32);
   12869       /* Make up a 32-bit value (t1), with the old value of AX in the
   12870          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   12871          bits. */
   12872       assign(t1,
   12873              binop(Iop_16HLto32,
   12874                    unop(Iop_32to16,
   12875                         mk_x86g_calculate_eflags_all()),
   12876                    getIReg(2, R_EAX)
   12877             ));
   12878       /* Call the helper fn, to get a new AX and OSZACP value, and
   12879          poke both back into the guest state.  Also pass the helper
   12880          the actual opcode so it knows which of the 4 instructions it
   12881          is doing the computation for. */
   12882       vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
   12883       assign(t2,
   12884               mkIRExprCCall(
   12885                  Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
   12886                  &x86g_calculate_daa_das_aaa_aas,
   12887                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   12888             ));
   12889      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   12890 
   12891      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12892      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12893      stmt( IRStmt_Put( OFFB_CC_DEP1,
   12894                        binop(Iop_And32,
   12895                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   12896                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   12897                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
   12898                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
   12899                             )
   12900                       )
   12901          );
   12902      /* Set NDEP even though it isn't used.  This makes redundant-PUT
   12903         elimination of previous stores to this field work better. */
   12904      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12905      switch (opc) {
   12906         case 0x27: DIP("daa\n"); break;
   12907         case 0x2F: DIP("das\n"); break;
   12908         case 0x37: DIP("aaa\n"); break;
   12909         case 0x3F: DIP("aas\n"); break;
   12910         default: vassert(0);
   12911      }
   12912      break;
   12913 
   12914    case 0xD4: /* AAM */
   12915    case 0xD5: /* AAD */
   12916       d32 = getIByte(delta); delta++;
   12917       if (sz != 4 || d32 != 10) goto decode_failure;
   12918       t1 = newTemp(Ity_I32);
   12919       t2 = newTemp(Ity_I32);
   12920       /* Make up a 32-bit value (t1), with the old value of AX in the
   12921          bottom 16 bits, and the old OSZACP bitmask in the upper 16
   12922          bits. */
   12923       assign(t1,
   12924              binop(Iop_16HLto32,
   12925                    unop(Iop_32to16,
   12926                         mk_x86g_calculate_eflags_all()),
   12927                    getIReg(2, R_EAX)
   12928             ));
   12929       /* Call the helper fn, to get a new AX and OSZACP value, and
   12930          poke both back into the guest state.  Also pass the helper
   12931          the actual opcode so it knows which of the 2 instructions it
   12932          is doing the computation for. */
   12933       assign(t2,
   12934               mkIRExprCCall(
   12935                  Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
   12936                  &x86g_calculate_aad_aam,
   12937                  mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
   12938             ));
   12939       putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
   12940 
   12941       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   12942       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   12943       stmt( IRStmt_Put( OFFB_CC_DEP1,
   12944                         binop(Iop_And32,
   12945                               binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
   12946                               mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
   12947                                      | X86G_CC_MASK_A | X86G_CC_MASK_Z
   12948                                      | X86G_CC_MASK_S| X86G_CC_MASK_O )
   12949                              )
   12950                        )
   12951           );
   12952       /* Set NDEP even though it isn't used.  This makes
   12953          redundant-PUT elimination of previous stores to this field
   12954          work better. */
   12955       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   12956 
   12957       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
   12958       break;
   12959 
   12960    /* ------------------------ CWD/CDQ -------------------- */
   12961 
   12962    case 0x98: /* CBW */
   12963       if (sz == 4) {
   12964          putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
   12965          DIP("cwde\n");
   12966       } else {
   12967          vassert(sz == 2);
   12968          putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
   12969          DIP("cbw\n");
   12970       }
   12971       break;
   12972 
   12973    case 0x99: /* CWD/CDQ */
   12974       ty = szToITy(sz);
   12975       putIReg(sz, R_EDX,
   12976                   binop(mkSizedOp(ty,Iop_Sar8),
   12977                         getIReg(sz, R_EAX),
   12978                         mkU8(sz == 2 ? 15 : 31)) );
   12979       DIP(sz == 2 ? "cwdq\n" : "cdqq\n");
   12980       break;
   12981 
   12982    /* ------------------------ FPU ops -------------------- */
   12983 
   12984    case 0x9E: /* SAHF */
   12985       codegen_SAHF();
   12986       DIP("sahf\n");
   12987       break;
   12988 
   12989    case 0x9F: /* LAHF */
   12990       codegen_LAHF();
   12991       DIP("lahf\n");
   12992       break;
   12993 
   12994    case 0x9B: /* FWAIT */
   12995       /* ignore? */
   12996       DIP("fwait\n");
   12997       break;
   12998 
   12999    case 0xD8:
   13000    case 0xD9:
   13001    case 0xDA:
   13002    case 0xDB:
   13003    case 0xDC:
   13004    case 0xDD:
   13005    case 0xDE:
   13006    case 0xDF: {
   13007       Int  delta0    = delta;
   13008       Bool decode_OK = False;
   13009       delta = dis_FPU ( &decode_OK, sorb, delta );
   13010       if (!decode_OK) {
   13011          delta = delta0;
   13012          goto decode_failure;
   13013       }
   13014       break;
   13015    }
   13016 
   13017    /* ------------------------ INC & DEC ------------------ */
   13018 
   13019    case 0x40: /* INC eAX */
   13020    case 0x41: /* INC eCX */
   13021    case 0x42: /* INC eDX */
   13022    case 0x43: /* INC eBX */
   13023    case 0x44: /* INC eSP */
   13024    case 0x45: /* INC eBP */
   13025    case 0x46: /* INC eSI */
   13026    case 0x47: /* INC eDI */
   13027       vassert(sz == 2 || sz == 4);
   13028       ty = szToITy(sz);
   13029       t1 = newTemp(ty);
   13030       assign( t1, binop(mkSizedOp(ty,Iop_Add8),
   13031                         getIReg(sz, (UInt)(opc - 0x40)),
   13032                         mkU(ty,1)) );
   13033       setFlags_INC_DEC( True, t1, ty );
   13034       putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
   13035       DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
   13036       break;
   13037 
   13038    case 0x48: /* DEC eAX */
   13039    case 0x49: /* DEC eCX */
   13040    case 0x4A: /* DEC eDX */
   13041    case 0x4B: /* DEC eBX */
   13042    case 0x4C: /* DEC eSP */
   13043    case 0x4D: /* DEC eBP */
   13044    case 0x4E: /* DEC eSI */
   13045    case 0x4F: /* DEC eDI */
   13046       vassert(sz == 2 || sz == 4);
   13047       ty = szToITy(sz);
   13048       t1 = newTemp(ty);
   13049       assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
   13050                         getIReg(sz, (UInt)(opc - 0x48)),
   13051                         mkU(ty,1)) );
   13052       setFlags_INC_DEC( False, t1, ty );
   13053       putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
   13054       DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
   13055       break;
   13056 
   13057    /* ------------------------ INT ------------------------ */
   13058 
   13059    case 0xCC: /* INT 3 */
   13060       jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
   13061       dres.whatNext = Dis_StopHere;
   13062       DIP("int $0x3\n");
   13063       break;
   13064 
   13065    case 0xCD: /* INT imm8 */
   13066       d32 = getIByte(delta); delta++;
   13067 
   13068       /* For any of the cases where we emit a jump (that is, for all
   13069          currently handled cases), it's important that all ArchRegs
   13070          carry their up-to-date value at this point.  So we declare an
   13071          end-of-block here, which forces any TempRegs caching ArchRegs
   13072          to be flushed. */
   13073 
   13074       /* Handle int $0x40 .. $0x43 by synthesising a segfault and a
   13075          restart of this instruction (hence the "-2" two lines below,
   13076          to get the restart EIP to be this instruction).  This is
   13077          probably Linux-specific and it would be more correct to only
   13078          do this if the VexAbiInfo says that is what we should do. */
   13079       if (d32 >= 0x40 && d32 <= 0x43) {
   13080          jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
   13081          dres.whatNext = Dis_StopHere;
   13082          DIP("int $0x%x\n", (Int)d32);
   13083          break;
   13084       }
   13085 
   13086       /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
   13087          (darwin syscalls).  As part of this, note where we are, so we
   13088          can back up the guest to this point if the syscall needs to
   13089          be restarted. */
   13090       if (d32 == 0x80) {
   13091          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13092                            mkU32(guest_EIP_curr_instr) ) );
   13093          jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
   13094          dres.whatNext = Dis_StopHere;
   13095          DIP("int $0x80\n");
   13096          break;
   13097       }
   13098       if (d32 == 0x81) {
   13099          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13100                            mkU32(guest_EIP_curr_instr) ) );
   13101          jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
   13102          dres.whatNext = Dis_StopHere;
   13103          DIP("int $0x81\n");
   13104          break;
   13105       }
   13106       if (d32 == 0x82) {
   13107          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   13108                            mkU32(guest_EIP_curr_instr) ) );
   13109          jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
   13110          dres.whatNext = Dis_StopHere;
   13111          DIP("int $0x82\n");
   13112          break;
   13113       }
   13114 
   13115       /* none of the above */
   13116       goto decode_failure;
   13117 
   13118    /* ------------------------ Jcond, byte offset --------- */
   13119 
   13120    case 0xEB: /* Jb (jump, byte offset) */
   13121       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13122       delta++;
   13123       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13124          dres.whatNext   = Dis_ResteerU;
   13125          dres.continueAt = (Addr64)(Addr32)d32;
   13126       } else {
   13127          jmp_lit(Ijk_Boring,d32);
   13128          dres.whatNext = Dis_StopHere;
   13129       }
   13130       DIP("jmp-8 0x%x\n", d32);
   13131       break;
   13132 
   13133    case 0xE9: /* Jv (jump, 16/32 offset) */
   13134       vassert(sz == 4); /* JRS added 2004 July 11 */
   13135       d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
   13136       delta += sz;
   13137       if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13138          dres.whatNext   = Dis_ResteerU;
   13139          dres.continueAt = (Addr64)(Addr32)d32;
   13140       } else {
   13141          jmp_lit(Ijk_Boring,d32);
   13142          dres.whatNext = Dis_StopHere;
   13143       }
   13144       DIP("jmp 0x%x\n", d32);
   13145       break;
   13146 
   13147    case 0x70: /* JOb (jump overflow) */
   13148    case 0x71: /* JNOb (jump no overflow) */
   13149    case 0x72: /* JBb/JNAEb (jump below) */
   13150    case 0x73: /* JNBb/JAEb (jump not below) */
   13151    case 0x74: /* JZb/JEb (jump zero) */
   13152    case 0x75: /* JNZb/JNEb (jump not zero) */
   13153    case 0x76: /* JBEb/JNAb (jump below or equal) */
   13154    case 0x77: /* JNBEb/JAb (jump not below or equal) */
   13155    case 0x78: /* JSb (jump negative) */
   13156    case 0x79: /* JNSb (jump not negative) */
   13157    case 0x7A: /* JP (jump parity even) */
   13158    case 0x7B: /* JNP/JPO (jump parity odd) */
   13159    case 0x7C: /* JLb/JNGEb (jump less) */
   13160    case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   13161    case 0x7E: /* JLEb/JNGb (jump less or equal) */
   13162    case 0x7F: /* JGb/JNLEb (jump greater) */
   13163     { Int    jmpDelta;
   13164       HChar* comment  = "";
   13165       jmpDelta = (Int)getSDisp8(delta);
   13166       vassert(-128 <= jmpDelta && jmpDelta < 128);
   13167       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
   13168       delta++;
   13169       if (resteerCisOk
   13170           && vex_control.guest_chase_cond
   13171           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13172           && jmpDelta < 0
   13173           && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   13174          /* Speculation: assume this backward branch is taken.  So we
   13175             need to emit a side-exit to the insn following this one,
   13176             on the negation of the condition, and continue at the
   13177             branch target address (d32).  If we wind up back at the
   13178             first instruction of the trace, just stop; it's better to
   13179             let the IR loop unroller handle that case. */
   13180          stmt( IRStmt_Exit(
   13181                   mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
   13182                   Ijk_Boring,
   13183                   IRConst_U32(guest_EIP_bbstart+delta) ) );
   13184          dres.whatNext   = Dis_ResteerC;
   13185          dres.continueAt = (Addr64)(Addr32)d32;
   13186          comment = "(assumed taken)";
   13187       }
   13188       else
   13189       if (resteerCisOk
   13190           && vex_control.guest_chase_cond
   13191           && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   13192           && jmpDelta >= 0
   13193           && resteerOkFn( callback_opaque,
   13194                           (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   13195          /* Speculation: assume this forward branch is not taken.  So
   13196             we need to emit a side-exit to d32 (the dest) and continue
   13197             disassembling at the insn immediately following this
   13198             one. */
   13199          stmt( IRStmt_Exit(
   13200                   mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
   13201                   Ijk_Boring,
   13202                   IRConst_U32(d32) ) );
   13203          dres.whatNext   = Dis_ResteerC;
   13204          dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   13205          comment = "(assumed not taken)";
   13206       }
   13207       else {
   13208          /* Conservative default translation - end the block at this
   13209             point. */
   13210          jcc_01( (X86Condcode)(opc - 0x70),
   13211                  (Addr32)(guest_EIP_bbstart+delta), d32);
   13212          dres.whatNext = Dis_StopHere;
   13213       }
   13214       DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
   13215       break;
   13216     }
   13217 
   13218    case 0xE3: /* JECXZ (for JCXZ see above) */
   13219       if (sz != 4) goto decode_failure;
   13220       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13221       delta ++;
   13222       stmt( IRStmt_Exit(
   13223                binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
   13224             Ijk_Boring,
   13225             IRConst_U32(d32)
   13226           ));
   13227       DIP("jecxz 0x%x\n", d32);
   13228       break;
   13229 
   13230    case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   13231    case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   13232    case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
   13233     { /* Again, the docs say this uses ECX/CX as a count depending on
   13234          the address size override, not the operand one.  Since we
   13235          don't handle address size overrides, I guess that means
   13236          ECX. */
   13237       IRExpr* zbit  = NULL;
   13238       IRExpr* count = NULL;
   13239       IRExpr* cond  = NULL;
   13240       HChar*  xtra  = NULL;
   13241 
   13242       if (sz != 4) goto decode_failure;
   13243       d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
   13244       delta++;
   13245       putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
   13246 
   13247       count = getIReg(4,R_ECX);
   13248       cond = binop(Iop_CmpNE32, count, mkU32(0));
   13249       switch (opc) {
   13250          case 0xE2:
   13251             xtra = "";
   13252             break;
   13253          case 0xE1:
   13254             xtra = "e";
   13255             zbit = mk_x86g_calculate_condition( X86CondZ );
   13256 	    cond = mkAnd1(cond, zbit);
   13257             break;
   13258          case 0xE0:
   13259             xtra = "ne";
   13260             zbit = mk_x86g_calculate_condition( X86CondNZ );
   13261 	    cond = mkAnd1(cond, zbit);
   13262             break;
   13263          default:
   13264 	    vassert(0);
   13265       }
   13266       stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
   13267 
   13268       DIP("loop%s 0x%x\n", xtra, d32);
   13269       break;
   13270     }
   13271 
   13272    /* ------------------------ IMUL ----------------------- */
   13273 
   13274    case 0x69: /* IMUL Iv, Ev, Gv */
   13275       delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
   13276       break;
   13277    case 0x6B: /* IMUL Ib, Ev, Gv */
   13278       delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
   13279       break;
   13280 
   13281    /* ------------------------ MOV ------------------------ */
   13282 
   13283    case 0x88: /* MOV Gb,Eb */
   13284       delta = dis_mov_G_E(sorb, 1, delta);
   13285       break;
   13286 
   13287    case 0x89: /* MOV Gv,Ev */
   13288       delta = dis_mov_G_E(sorb, sz, delta);
   13289       break;
   13290 
   13291    case 0x8A: /* MOV Eb,Gb */
   13292       delta = dis_mov_E_G(sorb, 1, delta);
   13293       break;
   13294 
   13295    case 0x8B: /* MOV Ev,Gv */
   13296       delta = dis_mov_E_G(sorb, sz, delta);
   13297       break;
   13298 
   13299    case 0x8D: /* LEA M,Gv */
   13300       if (sz != 4)
   13301          goto decode_failure;
   13302       modrm = getIByte(delta);
   13303       if (epartIsReg(modrm))
   13304          goto decode_failure;
   13305       /* NOTE!  this is the one place where a segment override prefix
   13306          has no effect on the address calculation.  Therefore we pass
   13307          zero instead of sorb here. */
   13308       addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
   13309       delta += alen;
   13310       putIReg(sz, gregOfRM(modrm), mkexpr(addr));
   13311       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   13312                             nameIReg(sz,gregOfRM(modrm)));
   13313       break;
   13314 
   13315    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
   13316       delta = dis_mov_Sw_Ew(sorb, sz, delta);
   13317       break;
   13318 
   13319    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
   13320       delta = dis_mov_Ew_Sw(sorb, delta);
   13321       break;
   13322 
   13323    case 0xA0: /* MOV Ob,AL */
   13324       sz = 1;
   13325       /* Fall through ... */
   13326    case 0xA1: /* MOV Ov,eAX */
   13327       d32 = getUDisp32(delta); delta += 4;
   13328       ty = szToITy(sz);
   13329       addr = newTemp(Ity_I32);
   13330       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13331       putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
   13332       DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
   13333                                 d32, nameIReg(sz,R_EAX));
   13334       break;
   13335 
   13336    case 0xA2: /* MOV AL,Ob */
   13337       sz = 1;
   13338       /* Fall through ... */
   13339    case 0xA3: /* MOV eAX,Ov */
   13340       d32 = getUDisp32(delta); delta += 4;
   13341       ty = szToITy(sz);
   13342       addr = newTemp(Ity_I32);
   13343       assign( addr, handleSegOverride(sorb, mkU32(d32)) );
   13344       storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
   13345       DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
   13346                                 sorbTxt(sorb), d32);
   13347       break;
   13348 
   13349    case 0xB0: /* MOV imm,AL */
   13350    case 0xB1: /* MOV imm,CL */
   13351    case 0xB2: /* MOV imm,DL */
   13352    case 0xB3: /* MOV imm,BL */
   13353    case 0xB4: /* MOV imm,AH */
   13354    case 0xB5: /* MOV imm,CH */
   13355    case 0xB6: /* MOV imm,DH */
   13356    case 0xB7: /* MOV imm,BH */
   13357       d32 = getIByte(delta); delta += 1;
   13358       putIReg(1, opc-0xB0, mkU8(d32));
   13359       DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
   13360       break;
   13361 
   13362    case 0xB8: /* MOV imm,eAX */
   13363    case 0xB9: /* MOV imm,eCX */
   13364    case 0xBA: /* MOV imm,eDX */
   13365    case 0xBB: /* MOV imm,eBX */
   13366    case 0xBC: /* MOV imm,eSP */
   13367    case 0xBD: /* MOV imm,eBP */
   13368    case 0xBE: /* MOV imm,eSI */
   13369    case 0xBF: /* MOV imm,eDI */
   13370       d32 = getUDisp(sz,delta); delta += sz;
   13371       putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
   13372       DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
   13373       break;
   13374 
   13375    case 0xC6: /* MOV Ib,Eb */
   13376       sz = 1;
   13377       goto do_Mov_I_E;
   13378    case 0xC7: /* MOV Iv,Ev */
   13379       goto do_Mov_I_E;
   13380 
   13381    do_Mov_I_E:
   13382       modrm = getIByte(delta);
   13383       if (epartIsReg(modrm)) {
   13384          delta++; /* mod/rm byte */
   13385          d32 = getUDisp(sz,delta); delta += sz;
   13386          putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
   13387          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
   13388                                   nameIReg(sz,eregOfRM(modrm)));
   13389       } else {
   13390          addr = disAMode ( &alen, sorb, delta, dis_buf );
   13391          delta += alen;
   13392          d32 = getUDisp(sz,delta); delta += sz;
   13393          storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
   13394          DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
   13395       }
   13396       break;
   13397 
   13398    /* ------------------------ opl imm, A ----------------- */
   13399 
   13400    case 0x04: /* ADD Ib, AL */
   13401       delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
   13402       break;
   13403    case 0x05: /* ADD Iv, eAX */
   13404       delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
   13405       break;
   13406 
   13407    case 0x0C: /* OR Ib, AL */
   13408       delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
   13409       break;
   13410    case 0x0D: /* OR Iv, eAX */
   13411       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   13412       break;
   13413 
   13414    case 0x14: /* ADC Ib, AL */
   13415       delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
   13416       break;
   13417    case 0x15: /* ADC Iv, eAX */
   13418       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   13419       break;
   13420 
   13421    case 0x1C: /* SBB Ib, AL */
   13422       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   13423       break;
   13424    case 0x1D: /* SBB Iv, eAX */
   13425       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   13426       break;
   13427 
   13428    case 0x24: /* AND Ib, AL */
   13429       delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
   13430       break;
   13431    case 0x25: /* AND Iv, eAX */
   13432       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   13433       break;
   13434 
   13435    case 0x2C: /* SUB Ib, AL */
   13436       delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
   13437       break;
   13438    case 0x2D: /* SUB Iv, eAX */
   13439       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   13440       break;
   13441 
   13442    case 0x34: /* XOR Ib, AL */
   13443       delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
   13444       break;
   13445    case 0x35: /* XOR Iv, eAX */
   13446       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   13447       break;
   13448 
   13449    case 0x3C: /* CMP Ib, AL */
   13450       delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
   13451       break;
   13452    case 0x3D: /* CMP Iv, eAX */
   13453       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   13454       break;
   13455 
   13456    case 0xA8: /* TEST Ib, AL */
   13457       delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
   13458       break;
   13459    case 0xA9: /* TEST Iv, eAX */
   13460       delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
   13461       break;
   13462 
   13463    /* ------------------------ opl Ev, Gv ----------------- */
   13464 
   13465    case 0x02: /* ADD Eb,Gb */
   13466       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
   13467       break;
   13468    case 0x03: /* ADD Ev,Gv */
   13469       delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
   13470       break;
   13471 
   13472    case 0x0A: /* OR Eb,Gb */
   13473       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
   13474       break;
   13475    case 0x0B: /* OR Ev,Gv */
   13476       delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
   13477       break;
   13478 
   13479    case 0x12: /* ADC Eb,Gb */
   13480       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
   13481       break;
   13482    case 0x13: /* ADC Ev,Gv */
   13483       delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
   13484       break;
   13485 
   13486    case 0x1A: /* SBB Eb,Gb */
   13487       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
   13488       break;
   13489    case 0x1B: /* SBB Ev,Gv */
   13490       delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
   13491       break;
   13492 
   13493    case 0x22: /* AND Eb,Gb */
   13494       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
   13495       break;
   13496    case 0x23: /* AND Ev,Gv */
   13497       delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
   13498       break;
   13499 
   13500    case 0x2A: /* SUB Eb,Gb */
   13501       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
   13502       break;
   13503    case 0x2B: /* SUB Ev,Gv */
   13504       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
   13505       break;
   13506 
   13507    case 0x32: /* XOR Eb,Gb */
   13508       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
   13509       break;
   13510    case 0x33: /* XOR Ev,Gv */
   13511       delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
   13512       break;
   13513 
   13514    case 0x3A: /* CMP Eb,Gb */
   13515       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
   13516       break;
   13517    case 0x3B: /* CMP Ev,Gv */
   13518       delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
   13519       break;
   13520 
   13521    case 0x84: /* TEST Eb,Gb */
   13522       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
   13523       break;
   13524    case 0x85: /* TEST Ev,Gv */
   13525       delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
   13526       break;
   13527 
   13528    /* ------------------------ opl Gv, Ev ----------------- */
   13529 
   13530    case 0x00: /* ADD Gb,Eb */
   13531       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13532                             Iop_Add8, True, 1, delta, "add" );
   13533       break;
   13534    case 0x01: /* ADD Gv,Ev */
   13535       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13536                             Iop_Add8, True, sz, delta, "add" );
   13537       break;
   13538 
   13539    case 0x08: /* OR Gb,Eb */
   13540       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13541                             Iop_Or8, True, 1, delta, "or" );
   13542       break;
   13543    case 0x09: /* OR Gv,Ev */
   13544       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13545                             Iop_Or8, True, sz, delta, "or" );
   13546       break;
   13547 
   13548    case 0x10: /* ADC Gb,Eb */
   13549       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13550                             Iop_Add8, True, 1, delta, "adc" );
   13551       break;
   13552    case 0x11: /* ADC Gv,Ev */
   13553       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13554                             Iop_Add8, True, sz, delta, "adc" );
   13555       break;
   13556 
   13557    case 0x18: /* SBB Gb,Eb */
   13558       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13559                             Iop_Sub8, True, 1, delta, "sbb" );
   13560       break;
   13561    case 0x19: /* SBB Gv,Ev */
   13562       delta = dis_op2_G_E ( sorb, pfx_lock, True,
   13563                             Iop_Sub8, True, sz, delta, "sbb" );
   13564       break;
   13565 
   13566    case 0x20: /* AND Gb,Eb */
   13567       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13568                             Iop_And8, True, 1, delta, "and" );
   13569       break;
   13570    case 0x21: /* AND Gv,Ev */
   13571       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13572                             Iop_And8, True, sz, delta, "and" );
   13573       break;
   13574 
   13575    case 0x28: /* SUB Gb,Eb */
   13576       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13577                             Iop_Sub8, True, 1, delta, "sub" );
   13578       break;
   13579    case 0x29: /* SUB Gv,Ev */
   13580       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13581                             Iop_Sub8, True, sz, delta, "sub" );
   13582       break;
   13583 
   13584    case 0x30: /* XOR Gb,Eb */
   13585       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13586                             Iop_Xor8, True, 1, delta, "xor" );
   13587       break;
   13588    case 0x31: /* XOR Gv,Ev */
   13589       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13590                             Iop_Xor8, True, sz, delta, "xor" );
   13591       break;
   13592 
   13593    case 0x38: /* CMP Gb,Eb */
   13594       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13595                             Iop_Sub8, False, 1, delta, "cmp" );
   13596       break;
   13597    case 0x39: /* CMP Gv,Ev */
   13598       delta = dis_op2_G_E ( sorb, pfx_lock, False,
   13599                             Iop_Sub8, False, sz, delta, "cmp" );
   13600       break;
   13601 
   13602    /* ------------------------ POP ------------------------ */
   13603 
   13604    case 0x58: /* POP eAX */
   13605    case 0x59: /* POP eCX */
   13606    case 0x5A: /* POP eDX */
   13607    case 0x5B: /* POP eBX */
   13608    case 0x5D: /* POP eBP */
   13609    case 0x5E: /* POP eSI */
   13610    case 0x5F: /* POP eDI */
   13611    case 0x5C: /* POP eSP */
   13612       vassert(sz == 2 || sz == 4);
   13613       t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
   13614       assign(t2, getIReg(4, R_ESP));
   13615       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   13616       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13617       putIReg(sz, opc-0x58, mkexpr(t1));
   13618       DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
   13619       break;
   13620 
   13621    case 0x9D: /* POPF */
   13622       vassert(sz == 2 || sz == 4);
   13623       t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
   13624       assign(t2, getIReg(4, R_ESP));
   13625       assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
   13626       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
   13627 
   13628       /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
   13629 	 value in t1. */
   13630       set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
   13631                                  ((Addr32)guest_EIP_bbstart)+delta );
   13632 
   13633       DIP("popf%c\n", nameISize(sz));
   13634       break;
   13635 
   13636    case 0x61: /* POPA */
   13637       /* This is almost certainly wrong for sz==2.  So ... */
   13638       if (sz != 4) goto decode_failure;
   13639 
   13640       /* t5 is the old %ESP value. */
   13641       t5 = newTemp(Ity_I32);
   13642       assign( t5, getIReg(4, R_ESP) );
   13643 
   13644       /* Reload all the registers, except %esp. */
   13645       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
   13646       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
   13647       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
   13648       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
   13649       /* ignore saved %ESP */
   13650       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
   13651       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
   13652       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
   13653 
   13654       /* and move %ESP back up */
   13655       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
   13656 
   13657       DIP("popa%c\n", nameISize(sz));
   13658       break;
   13659 
   13660    case 0x8F: /* POPL/POPW m32 */
   13661      { Int    len;
   13662        UChar  rm = getIByte(delta);
   13663 
   13664        /* make sure this instruction is correct POP */
   13665        if (epartIsReg(rm) || gregOfRM(rm) != 0)
   13666           goto decode_failure;
   13667        /* and has correct size */
   13668        if (sz != 4 && sz != 2)
   13669           goto decode_failure;
   13670        ty = szToITy(sz);
   13671 
   13672        t1 = newTemp(Ity_I32); /* stack address */
   13673        t3 = newTemp(ty); /* data */
   13674        /* set t1 to ESP: t1 = ESP */
   13675        assign( t1, getIReg(4, R_ESP) );
   13676        /* load M[ESP] to virtual register t3: t3 = M[t1] */
   13677        assign( t3, loadLE(ty, mkexpr(t1)) );
   13678 
   13679        /* increase ESP; must be done before the STORE.  Intel manual says:
   13680             If the ESP register is used as a base register for addressing
   13681             a destination operand in memory, the POP instruction computes
   13682             the effective address of the operand after it increments the
   13683             ESP register.
   13684        */
   13685        putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
   13686 
   13687        /* resolve MODR/M */
   13688        addr = disAMode ( &len, sorb, delta, dis_buf);
   13689        storeLE( mkexpr(addr), mkexpr(t3) );
   13690 
   13691        DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
   13692 
   13693        delta += len;
   13694        break;
   13695      }
   13696 
   13697    case 0x1F: /* POP %DS */
   13698       dis_pop_segreg( R_DS, sz ); break;
   13699    case 0x07: /* POP %ES */
   13700       dis_pop_segreg( R_ES, sz ); break;
   13701    case 0x17: /* POP %SS */
   13702       dis_pop_segreg( R_SS, sz ); break;
   13703 
   13704    /* ------------------------ PUSH ----------------------- */
   13705 
   13706    case 0x50: /* PUSH eAX */
   13707    case 0x51: /* PUSH eCX */
   13708    case 0x52: /* PUSH eDX */
   13709    case 0x53: /* PUSH eBX */
   13710    case 0x55: /* PUSH eBP */
   13711    case 0x56: /* PUSH eSI */
   13712    case 0x57: /* PUSH eDI */
   13713    case 0x54: /* PUSH eSP */
   13714       /* This is the Right Way, in that the value to be pushed is
   13715          established before %esp is changed, so that pushl %esp
   13716          correctly pushes the old value. */
   13717       vassert(sz == 2 || sz == 4);
   13718       ty = sz==2 ? Ity_I16 : Ity_I32;
   13719       t1 = newTemp(ty); t2 = newTemp(Ity_I32);
   13720       assign(t1, getIReg(sz, opc-0x50));
   13721       assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
   13722       putIReg(4, R_ESP, mkexpr(t2) );
   13723       storeLE(mkexpr(t2),mkexpr(t1));
   13724       DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
   13725       break;
   13726 
   13727 
   13728    case 0x68: /* PUSH Iv */
   13729       d32 = getUDisp(sz,delta); delta += sz;
   13730       goto do_push_I;
   13731    case 0x6A: /* PUSH Ib, sign-extended to sz */
   13732       d32 = getSDisp8(delta); delta += 1;
   13733       goto do_push_I;
   13734    do_push_I:
   13735       ty = szToITy(sz);
   13736       t1 = newTemp(Ity_I32); t2 = newTemp(ty);
   13737       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13738       putIReg(4, R_ESP, mkexpr(t1) );
   13739       /* stop mkU16 asserting if d32 is a negative 16-bit number
   13740          (bug #132813) */
   13741       if (ty == Ity_I16)
   13742          d32 &= 0xFFFF;
   13743       storeLE( mkexpr(t1), mkU(ty,d32) );
   13744       DIP("push%c $0x%x\n", nameISize(sz), d32);
   13745       break;
   13746 
   13747    case 0x9C: /* PUSHF */ {
   13748       vassert(sz == 2 || sz == 4);
   13749 
   13750       t1 = newTemp(Ity_I32);
   13751       assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
   13752       putIReg(4, R_ESP, mkexpr(t1) );
   13753 
   13754       /* Calculate OSZACP, and patch in fixed fields as per
   13755          Intel docs.
   13756          - bit 1 is always 1
   13757          - bit 9 is Interrupt Enable (should always be 1 in user mode?)
   13758       */
   13759       t2 = newTemp(Ity_I32);
   13760       assign( t2, binop(Iop_Or32,
   13761                         mk_x86g_calculate_eflags_all(),
   13762                         mkU32( (1<<1)|(1<<9) ) ));
   13763 
   13764       /* Patch in the D flag.  This can simply be a copy of bit 10 of
   13765          baseBlock[OFFB_DFLAG]. */
   13766       t3 = newTemp(Ity_I32);
   13767       assign( t3, binop(Iop_Or32,
   13768                         mkexpr(t2),
   13769                         binop(Iop_And32,
   13770                               IRExpr_Get(OFFB_DFLAG,Ity_I32),
   13771                               mkU32(1<<10)))
   13772             );
   13773 
   13774       /* And patch in the ID flag. */
   13775       t4 = newTemp(Ity_I32);
   13776       assign( t4, binop(Iop_Or32,
   13777                         mkexpr(t3),
   13778                         binop(Iop_And32,
   13779                               binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
   13780                                                mkU8(21)),
   13781                               mkU32(1<<21)))
   13782             );
   13783 
   13784       /* And patch in the AC flag. */
   13785       t5 = newTemp(Ity_I32);
   13786       assign( t5, binop(Iop_Or32,
   13787                         mkexpr(t4),
   13788                         binop(Iop_And32,
   13789                               binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
   13790                                                mkU8(18)),
   13791                               mkU32(1<<18)))
   13792             );
   13793 
   13794       /* if sz==2, the stored value needs to be narrowed. */
   13795       if (sz == 2)
   13796         storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
   13797       else
   13798         storeLE( mkexpr(t1), mkexpr(t5) );
   13799 
   13800       DIP("pushf%c\n", nameISize(sz));
   13801       break;
   13802    }
   13803 
   13804    case 0x60: /* PUSHA */
   13805       /* This is almost certainly wrong for sz==2.  So ... */
   13806       if (sz != 4) goto decode_failure;
   13807 
   13808       /* This is the Right Way, in that the value to be pushed is
   13809          established before %esp is changed, so that pusha
   13810          correctly pushes the old %esp value.  The new %esp value is
   13811          written to the guest state before any of the stores. */
   13812       /* t0 is the %ESP value we're going to push. */
   13813       t0 = newTemp(Ity_I32);
   13814       assign( t0, getIReg(4, R_ESP) );
   13815 
   13816       /* t5 will be the new %ESP value. */
   13817       t5 = newTemp(Ity_I32);
   13818       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
   13819 
   13820       /* Update guest state before prodding memory. */
   13821       putIReg(4, R_ESP, mkexpr(t5));
   13822 
   13823       /* Dump all the registers. */
   13824       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
   13825       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
   13826       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
   13827       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
   13828       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
   13829       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
   13830       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
   13831       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
   13832 
   13833       DIP("pusha%c\n", nameISize(sz));
   13834       break;
   13835 
   13836    case 0x0E: /* PUSH %CS */
   13837       dis_push_segreg( R_CS, sz ); break;
   13838    case 0x1E: /* PUSH %DS */
   13839       dis_push_segreg( R_DS, sz ); break;
   13840    case 0x06: /* PUSH %ES */
   13841       dis_push_segreg( R_ES, sz ); break;
   13842    case 0x16: /* PUSH %SS */
   13843       dis_push_segreg( R_SS, sz ); break;
   13844 
   13845    /* ------------------------ SCAS et al ----------------- */
   13846 
   13847    case 0xA4: /* MOVS, no REP prefix */
   13848    case 0xA5:
   13849       if (sorb != 0)
   13850          goto decode_failure; /* else dis_string_op asserts */
   13851       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
   13852       break;
   13853 
   13854   case 0xA6: /* CMPS, no REP prefix */
   13855   case 0xA7:
   13856       if (sorb != 0)
   13857          goto decode_failure; /* else dis_string_op asserts */
   13858       dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
   13859       break;
   13860 
   13861    case 0xAA: /* STOS, no REP prefix */
   13862    case 0xAB:
   13863       if (sorb != 0)
   13864          goto decode_failure; /* else dis_string_op asserts */
   13865       dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
   13866       break;
   13867 
   13868    case 0xAC: /* LODS, no REP prefix */
   13869    case 0xAD:
   13870       if (sorb != 0)
   13871          goto decode_failure; /* else dis_string_op asserts */
   13872       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
   13873       break;
   13874 
   13875    case 0xAE: /* SCAS, no REP prefix */
   13876    case 0xAF:
   13877       if (sorb != 0)
   13878          goto decode_failure; /* else dis_string_op asserts */
   13879       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
   13880       break;
   13881 
   13882 
   13883    case 0xFC: /* CLD */
   13884       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
   13885       DIP("cld\n");
   13886       break;
   13887 
   13888    case 0xFD: /* STD */
   13889       stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
   13890       DIP("std\n");
   13891       break;
   13892 
   13893    case 0xF8: /* CLC */
   13894    case 0xF9: /* STC */
   13895    case 0xF5: /* CMC */
   13896       t0 = newTemp(Ity_I32);
   13897       t1 = newTemp(Ity_I32);
   13898       assign( t0, mk_x86g_calculate_eflags_all() );
   13899       switch (opc) {
   13900          case 0xF8:
   13901             assign( t1, binop(Iop_And32, mkexpr(t0),
   13902                                          mkU32(~X86G_CC_MASK_C)));
   13903             DIP("clc\n");
   13904             break;
   13905          case 0xF9:
   13906             assign( t1, binop(Iop_Or32, mkexpr(t0),
   13907                                         mkU32(X86G_CC_MASK_C)));
   13908             DIP("stc\n");
   13909             break;
   13910          case 0xF5:
   13911             assign( t1, binop(Iop_Xor32, mkexpr(t0),
   13912                                          mkU32(X86G_CC_MASK_C)));
   13913             DIP("cmc\n");
   13914             break;
   13915          default:
   13916             vpanic("disInstr(x86)(clc/stc/cmc)");
   13917       }
   13918       stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   13919       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   13920       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
   13921       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   13922          elimination of previous stores to this field work better. */
   13923       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   13924       break;
   13925 
   13926    case 0xD6: /* SALC */
   13927       t0 = newTemp(Ity_I32);
   13928       t1 = newTemp(Ity_I32);
   13929       assign( t0,  binop(Iop_And32,
   13930                          mk_x86g_calculate_eflags_c(),
   13931                          mkU32(1)) );
   13932       assign( t1, binop(Iop_Sar32,
   13933                         binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
   13934                         mkU8(31)) );
   13935       putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
   13936       DIP("salc\n");
   13937       break;
   13938 
   13939    /* REPNE prefix insn */
   13940    case 0xF2: {
   13941       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   13942       if (sorb != 0) goto decode_failure;
   13943       abyte = getIByte(delta); delta++;
   13944 
   13945       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   13946       dres.whatNext = Dis_StopHere;
   13947 
   13948       switch (abyte) {
   13949       /* According to the Intel manual, "repne movs" should never occur, but
   13950        * in practice it has happened, so allow for it here... */
   13951       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
   13952       case 0xA5:
   13953          dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
   13954                                  guest_EIP_bbstart+delta, "repne movs" );
   13955          break;
   13956 
   13957       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
   13958       case 0xA7:
   13959          dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig,
   13960                                  guest_EIP_bbstart+delta, "repne cmps" );
   13961          break;
   13962 
   13963       case 0xAA: sz = 1;   /* REPNE STOS<sz> */
   13964       case 0xAB:
   13965          dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig,
   13966                                  guest_EIP_bbstart+delta, "repne stos" );
   13967          break;
   13968 
   13969       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
   13970       case 0xAF:
   13971          dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
   13972                                  guest_EIP_bbstart+delta, "repne scas" );
   13973          break;
   13974 
   13975       default:
   13976          goto decode_failure;
   13977       }
   13978       break;
   13979    }
   13980 
   13981    /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
   13982       for the rest, it means REP) */
   13983    case 0xF3: {
   13984       Addr32 eip_orig = guest_EIP_bbstart + delta_start;
   13985       if (sorb != 0) goto decode_failure;
   13986       abyte = getIByte(delta); delta++;
   13987 
   13988       if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
   13989       dres.whatNext = Dis_StopHere;
   13990 
   13991       switch (abyte) {
   13992       case 0xA4: sz = 1;   /* REP MOVS<sz> */
   13993       case 0xA5:
   13994          dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig,
   13995                                      guest_EIP_bbstart+delta, "rep movs" );
   13996          break;
   13997 
   13998       case 0xA6: sz = 1;   /* REPE CMPS<sz> */
   13999       case 0xA7:
   14000          dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig,
   14001                                 guest_EIP_bbstart+delta, "repe cmps" );
   14002          break;
   14003 
   14004       case 0xAA: sz = 1;   /* REP STOS<sz> */
   14005       case 0xAB:
   14006          dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig,
   14007                                      guest_EIP_bbstart+delta, "rep stos" );
   14008          break;
   14009 
   14010       case 0xAC: sz = 1;   /* REP LODS<sz> */
   14011       case 0xAD:
   14012          dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig,
   14013                                      guest_EIP_bbstart+delta, "rep lods" );
   14014          break;
   14015 
   14016       case 0xAE: sz = 1;   /* REPE SCAS<sz> */
   14017       case 0xAF:
   14018          dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig,
   14019                                 guest_EIP_bbstart+delta, "repe scas" );
   14020          break;
   14021 
   14022       case 0x90:           /* REP NOP (PAUSE) */
   14023          /* a hint to the P4 re spin-wait loop */
   14024          DIP("rep nop (P4 pause)\n");
   14025          /* "observe" the hint.  The Vex client needs to be careful not
   14026             to cause very long delays as a result, though. */
   14027          jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
   14028          dres.whatNext = Dis_StopHere;
   14029          break;
   14030 
   14031       case 0xC3:           /* REP RET -- same as normal ret? */
   14032          dis_ret(0);
   14033          dres.whatNext = Dis_StopHere;
   14034          DIP("rep ret\n");
   14035          break;
   14036 
   14037       default:
   14038          goto decode_failure;
   14039       }
   14040       break;
   14041    }
   14042 
   14043    /* ------------------------ XCHG ----------------------- */
   14044 
   14045    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   14046       prefix; hence it must be translated with an IRCAS (at least, the
   14047       memory variant). */
   14048    case 0x86: /* XCHG Gb,Eb */
   14049       sz = 1;
   14050       /* Fall through ... */
   14051    case 0x87: /* XCHG Gv,Ev */
   14052       modrm = getIByte(delta);
   14053       ty = szToITy(sz);
   14054       t1 = newTemp(ty); t2 = newTemp(ty);
   14055       if (epartIsReg(modrm)) {
   14056          assign(t1, getIReg(sz, eregOfRM(modrm)));
   14057          assign(t2, getIReg(sz, gregOfRM(modrm)));
   14058          putIReg(sz, gregOfRM(modrm), mkexpr(t1));
   14059          putIReg(sz, eregOfRM(modrm), mkexpr(t2));
   14060          delta++;
   14061          DIP("xchg%c %s, %s\n",
   14062              nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
   14063                             nameIReg(sz,eregOfRM(modrm)));
   14064       } else {
   14065          *expect_CAS = True;
   14066          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14067          assign( t1, loadLE(ty,mkexpr(addr)) );
   14068          assign( t2, getIReg(sz,gregOfRM(modrm)) );
   14069          casLE( mkexpr(addr),
   14070                 mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
   14071          putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
   14072          delta += alen;
   14073          DIP("xchg%c %s, %s\n", nameISize(sz),
   14074                                 nameIReg(sz,gregOfRM(modrm)), dis_buf);
   14075       }
   14076       break;
   14077 
   14078    case 0x90: /* XCHG eAX,eAX */
   14079       DIP("nop\n");
   14080       break;
   14081    case 0x91: /* XCHG eAX,eCX */
   14082    case 0x92: /* XCHG eAX,eDX */
   14083    case 0x93: /* XCHG eAX,eBX */
   14084    case 0x94: /* XCHG eAX,eSP */
   14085    case 0x95: /* XCHG eAX,eBP */
   14086    case 0x96: /* XCHG eAX,eSI */
   14087    case 0x97: /* XCHG eAX,eDI */
   14088       codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
   14089       break;
   14090 
   14091    /* ------------------------ XLAT ----------------------- */
   14092 
   14093    case 0xD7: /* XLAT */
   14094       if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
   14095       putIReg(
   14096          1,
   14097          R_EAX/*AL*/,
   14098          loadLE(Ity_I8,
   14099                 handleSegOverride(
   14100                    sorb,
   14101                    binop(Iop_Add32,
   14102                          getIReg(4, R_EBX),
   14103                          unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
   14104 
   14105       DIP("xlat%c [ebx]\n", nameISize(sz));
   14106       break;
   14107 
   14108    /* ------------------------ IN / OUT ----------------------- */
   14109 
   14110    case 0xE4: /* IN imm8, AL */
   14111       sz = 1;
   14112       t1 = newTemp(Ity_I32);
   14113       abyte = getIByte(delta); delta++;
   14114       assign(t1, mkU32( abyte & 0xFF ));
   14115       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14116       goto do_IN;
   14117    case 0xE5: /* IN imm8, eAX */
   14118       vassert(sz == 2 || sz == 4);
   14119       t1 = newTemp(Ity_I32);
   14120       abyte = getIByte(delta); delta++;
   14121       assign(t1, mkU32( abyte & 0xFF ));
   14122       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
   14123       goto do_IN;
   14124    case 0xEC: /* IN %DX, AL */
   14125       sz = 1;
   14126       t1 = newTemp(Ity_I32);
   14127       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14128       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14129                                          nameIReg(sz,R_EAX));
   14130       goto do_IN;
   14131    case 0xED: /* IN %DX, eAX */
   14132       vassert(sz == 2 || sz == 4);
   14133       t1 = newTemp(Ity_I32);
   14134       assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
   14135       DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
   14136                                          nameIReg(sz,R_EAX));
   14137       goto do_IN;
   14138    do_IN: {
   14139       /* At this point, sz indicates the width, and t1 is a 32-bit
   14140          value giving port number. */
   14141       IRDirty* d;
   14142       vassert(sz == 1 || sz == 2 || sz == 4);
   14143       ty = szToITy(sz);
   14144       t2 = newTemp(Ity_I32);
   14145       d = unsafeIRDirty_1_N(
   14146              t2,
   14147              0/*regparms*/,
   14148              "x86g_dirtyhelper_IN",
   14149              &x86g_dirtyhelper_IN,
   14150              mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
   14151           );
   14152       /* do the call, dumping the result in t2. */
   14153       stmt( IRStmt_Dirty(d) );
   14154       putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
   14155       break;
   14156    }
   14157 
   14158    case 0xE6: /* OUT AL, imm8 */
   14159       sz = 1;
   14160       t1 = newTemp(Ity_I32);
   14161       abyte = getIByte(delta); delta++;
   14162       assign( t1, mkU32( abyte & 0xFF ) );
   14163       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14164       goto do_OUT;
   14165    case 0xE7: /* OUT eAX, imm8 */
   14166       vassert(sz == 2 || sz == 4);
   14167       t1 = newTemp(Ity_I32);
   14168       abyte = getIByte(delta); delta++;
   14169       assign( t1, mkU32( abyte & 0xFF ) );
   14170       DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
   14171       goto do_OUT;
   14172    case 0xEE: /* OUT AL, %DX */
   14173       sz = 1;
   14174       t1 = newTemp(Ity_I32);
   14175       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14176       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14177                                           nameIReg(2,R_EDX));
   14178       goto do_OUT;
   14179    case 0xEF: /* OUT eAX, %DX */
   14180       vassert(sz == 2 || sz == 4);
   14181       t1 = newTemp(Ity_I32);
   14182       assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
   14183       DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
   14184                                           nameIReg(2,R_EDX));
   14185       goto do_OUT;
   14186    do_OUT: {
   14187       /* At this point, sz indicates the width, and t1 is a 32-bit
   14188          value giving port number. */
   14189       IRDirty* d;
   14190       vassert(sz == 1 || sz == 2 || sz == 4);
   14191       ty = szToITy(sz);
   14192       d = unsafeIRDirty_0_N(
   14193              0/*regparms*/,
   14194              "x86g_dirtyhelper_OUT",
   14195              &x86g_dirtyhelper_OUT,
   14196              mkIRExprVec_3( mkexpr(t1),
   14197                             widenUto32( getIReg(sz, R_EAX) ),
   14198                             mkU32(sz) )
   14199           );
   14200       stmt( IRStmt_Dirty(d) );
   14201       break;
   14202    }
   14203 
   14204    /* ------------------------ (Grp1 extensions) ---------- */
   14205 
   14206    case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
   14207                  case 0x80, but only in 32-bit mode. */
   14208       /* fallthru */
   14209    case 0x80: /* Grp1 Ib,Eb */
   14210       modrm = getIByte(delta);
   14211       am_sz = lengthAMode(delta);
   14212       sz    = 1;
   14213       d_sz  = 1;
   14214       d32   = getUChar(delta + am_sz);
   14215       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14216       break;
   14217 
   14218    case 0x81: /* Grp1 Iv,Ev */
   14219       modrm = getIByte(delta);
   14220       am_sz = lengthAMode(delta);
   14221       d_sz  = sz;
   14222       d32   = getUDisp(d_sz, delta + am_sz);
   14223       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14224       break;
   14225 
   14226    case 0x83: /* Grp1 Ib,Ev */
   14227       modrm = getIByte(delta);
   14228       am_sz = lengthAMode(delta);
   14229       d_sz  = 1;
   14230       d32   = getSDisp8(delta + am_sz);
   14231       delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
   14232       break;
   14233 
   14234    /* ------------------------ (Grp2 extensions) ---------- */
   14235 
   14236    case 0xC0: { /* Grp2 Ib,Eb */
   14237       Bool decode_OK = True;
   14238       modrm = getIByte(delta);
   14239       am_sz = lengthAMode(delta);
   14240       d_sz  = 1;
   14241       d32   = getUChar(delta + am_sz);
   14242       sz    = 1;
   14243       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14244                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14245       if (!decode_OK)
   14246          goto decode_failure;
   14247       break;
   14248    }
   14249    case 0xC1: { /* Grp2 Ib,Ev */
   14250       Bool decode_OK = True;
   14251       modrm = getIByte(delta);
   14252       am_sz = lengthAMode(delta);
   14253       d_sz  = 1;
   14254       d32   = getUChar(delta + am_sz);
   14255       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14256                          mkU8(d32 & 0xFF), NULL, &decode_OK );
   14257       if (!decode_OK)
   14258          goto decode_failure;
   14259       break;
   14260    }
   14261    case 0xD0: { /* Grp2 1,Eb */
   14262       Bool decode_OK = True;
   14263       modrm = getIByte(delta);
   14264       am_sz = lengthAMode(delta);
   14265       d_sz  = 0;
   14266       d32   = 1;
   14267       sz    = 1;
   14268       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14269                          mkU8(d32), NULL, &decode_OK );
   14270       if (!decode_OK)
   14271          goto decode_failure;
   14272       break;
   14273    }
   14274    case 0xD1: { /* Grp2 1,Ev */
   14275       Bool decode_OK = True;
   14276       modrm = getUChar(delta);
   14277       am_sz = lengthAMode(delta);
   14278       d_sz  = 0;
   14279       d32   = 1;
   14280       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14281                          mkU8(d32), NULL, &decode_OK );
   14282       if (!decode_OK)
   14283          goto decode_failure;
   14284       break;
   14285    }
   14286    case 0xD2: { /* Grp2 CL,Eb */
   14287       Bool decode_OK = True;
   14288       modrm = getUChar(delta);
   14289       am_sz = lengthAMode(delta);
   14290       d_sz  = 0;
   14291       sz    = 1;
   14292       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14293                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14294       if (!decode_OK)
   14295          goto decode_failure;
   14296       break;
   14297    }
   14298    case 0xD3: { /* Grp2 CL,Ev */
   14299       Bool decode_OK = True;
   14300       modrm = getIByte(delta);
   14301       am_sz = lengthAMode(delta);
   14302       d_sz  = 0;
   14303       delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
   14304                          getIReg(1,R_ECX), "%cl", &decode_OK );
   14305       if (!decode_OK)
   14306          goto decode_failure;
   14307       break;
   14308    }
   14309 
   14310    /* ------------------------ (Grp3 extensions) ---------- */
   14311 
   14312    case 0xF6: { /* Grp3 Eb */
   14313       Bool decode_OK = True;
   14314       delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
   14315       if (!decode_OK)
   14316          goto decode_failure;
   14317       break;
   14318    }
   14319    case 0xF7: { /* Grp3 Ev */
   14320       Bool decode_OK = True;
   14321       delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
   14322       if (!decode_OK)
   14323          goto decode_failure;
   14324       break;
   14325    }
   14326 
   14327    /* ------------------------ (Grp4 extensions) ---------- */
   14328 
   14329    case 0xFE: { /* Grp4 Eb */
   14330       Bool decode_OK = True;
   14331       delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
   14332       if (!decode_OK)
   14333          goto decode_failure;
   14334       break;
   14335    }
   14336 
   14337    /* ------------------------ (Grp5 extensions) ---------- */
   14338 
   14339    case 0xFF: { /* Grp5 Ev */
   14340       Bool decode_OK = True;
   14341       delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
   14342       if (!decode_OK)
   14343          goto decode_failure;
   14344       break;
   14345    }
   14346 
   14347    /* ------------------------ Escapes to 2-byte opcodes -- */
   14348 
   14349    case 0x0F: {
   14350       opc = getIByte(delta); delta++;
   14351       switch (opc) {
   14352 
   14353       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   14354 
   14355       case 0xBA: { /* Grp8 Ib,Ev */
   14356          Bool decode_OK = False;
   14357          modrm = getUChar(delta);
   14358          am_sz = lengthAMode(delta);
   14359          d32   = getSDisp8(delta + am_sz);
   14360          delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
   14361                                 am_sz, sz, d32, &decode_OK );
   14362          if (!decode_OK)
   14363             goto decode_failure;
   14364          break;
   14365       }
   14366 
   14367       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   14368 
   14369       case 0xBC: /* BSF Gv,Ev */
   14370          delta = dis_bs_E_G ( sorb, sz, delta, True );
   14371          break;
   14372       case 0xBD: /* BSR Gv,Ev */
   14373          delta = dis_bs_E_G ( sorb, sz, delta, False );
   14374          break;
   14375 
   14376       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   14377 
   14378       case 0xC8: /* BSWAP %eax */
   14379       case 0xC9:
   14380       case 0xCA:
   14381       case 0xCB:
   14382       case 0xCC:
   14383       case 0xCD:
   14384       case 0xCE:
   14385       case 0xCF: /* BSWAP %edi */
   14386          /* AFAICS from the Intel docs, this only exists at size 4. */
   14387          vassert(sz == 4);
   14388          t1 = newTemp(Ity_I32);
   14389          t2 = newTemp(Ity_I32);
   14390          assign( t1, getIReg(4, opc-0xC8) );
   14391 
   14392          assign( t2,
   14393             binop(Iop_Or32,
   14394                binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   14395             binop(Iop_Or32,
   14396                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   14397                                 mkU32(0x00FF0000)),
   14398             binop(Iop_Or32,
   14399                binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   14400                                 mkU32(0x0000FF00)),
   14401                binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   14402                                 mkU32(0x000000FF) )
   14403             )))
   14404          );
   14405 
   14406          putIReg(4, opc-0xC8, mkexpr(t2));
   14407          DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
   14408          break;
   14409 
   14410       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   14411 
   14412       case 0xA3: /* BT Gv,Ev */
   14413          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
   14414          break;
   14415       case 0xB3: /* BTR Gv,Ev */
   14416          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
   14417          break;
   14418       case 0xAB: /* BTS Gv,Ev */
   14419          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
   14420          break;
   14421       case 0xBB: /* BTC Gv,Ev */
   14422          delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
   14423          break;
   14424 
   14425       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   14426 
   14427       case 0x40: /* CMOVOb (cmov overflow) */
   14428       case 0x41: /* CMOVNOb (cmov not overflow) */
   14429       case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   14430       case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   14431       case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   14432       case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   14433       case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   14434       case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   14435       case 0x48: /* CMOVSb (cmov negative) */
   14436       case 0x49: /* CMOVNSb (cmov not negative) */
   14437       case 0x4A: /* CMOVP (cmov parity even) */
   14438       case 0x4B: /* CMOVNP (cmov parity odd) */
   14439       case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   14440       case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   14441       case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   14442       case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   14443          delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
   14444          break;
   14445 
   14446       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   14447 
   14448       case 0xB0: /* CMPXCHG Gb,Eb */
   14449          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
   14450          break;
   14451       case 0xB1: /* CMPXCHG Gv,Ev */
   14452          delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
   14453          break;
   14454 
   14455       case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
   14456          IRTemp expdHi    = newTemp(Ity_I32);
   14457          IRTemp expdLo    = newTemp(Ity_I32);
   14458          IRTemp dataHi    = newTemp(Ity_I32);
   14459          IRTemp dataLo    = newTemp(Ity_I32);
   14460          IRTemp oldHi     = newTemp(Ity_I32);
   14461          IRTemp oldLo     = newTemp(Ity_I32);
   14462          IRTemp flags_old = newTemp(Ity_I32);
   14463          IRTemp flags_new = newTemp(Ity_I32);
   14464          IRTemp success   = newTemp(Ity_I1);
   14465 
   14466          /* Translate this using a DCAS, even if there is no LOCK
   14467             prefix.  Life is too short to bother with generating two
   14468             different translations for the with/without-LOCK-prefix
   14469             cases. */
   14470          *expect_CAS = True;
   14471 
   14472 	 /* Decode, and generate address. */
   14473          if (sz != 4) goto decode_failure;
   14474          modrm = getIByte(delta);
   14475          if (epartIsReg(modrm)) goto decode_failure;
   14476          if (gregOfRM(modrm) != 1) goto decode_failure;
   14477          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14478          delta += alen;
   14479 
   14480          /* Get the expected and new values. */
   14481          assign( expdHi, getIReg(4,R_EDX) );
   14482          assign( expdLo, getIReg(4,R_EAX) );
   14483          assign( dataHi, getIReg(4,R_ECX) );
   14484          assign( dataLo, getIReg(4,R_EBX) );
   14485 
   14486          /* Do the DCAS */
   14487          stmt( IRStmt_CAS(
   14488                   mkIRCAS( oldHi, oldLo,
   14489                            Iend_LE, mkexpr(addr),
   14490                            mkexpr(expdHi), mkexpr(expdLo),
   14491                            mkexpr(dataHi), mkexpr(dataLo)
   14492                )));
   14493 
   14494          /* success when oldHi:oldLo == expdHi:expdLo */
   14495          assign( success,
   14496                  binop(Iop_CasCmpEQ32,
   14497                        binop(Iop_Or32,
   14498                              binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
   14499                              binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
   14500                        ),
   14501                        mkU32(0)
   14502                  ));
   14503 
   14504          /* If the DCAS is successful, that is to say oldHi:oldLo ==
   14505             expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
   14506             which is where they came from originally.  Both the actual
   14507             contents of these two regs, and any shadow values, are
   14508             unchanged.  If the DCAS fails then we're putting into
   14509             EDX:EAX the value seen in memory. */
   14510          putIReg(4, R_EDX,
   14511                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14512                                   mkexpr(oldHi),
   14513                                   mkexpr(expdHi)
   14514                 ));
   14515          putIReg(4, R_EAX,
   14516                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   14517                                   mkexpr(oldLo),
   14518                                   mkexpr(expdLo)
   14519                 ));
   14520 
   14521          /* Copy the success bit into the Z flag and leave the others
   14522             unchanged */
   14523          assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
   14524          assign(
   14525             flags_new,
   14526             binop(Iop_Or32,
   14527                   binop(Iop_And32, mkexpr(flags_old),
   14528                                    mkU32(~X86G_CC_MASK_Z)),
   14529                   binop(Iop_Shl32,
   14530                         binop(Iop_And32,
   14531                               unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
   14532                         mkU8(X86G_CC_SHIFT_Z)) ));
   14533 
   14534          stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
   14535          stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   14536          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
   14537          /* Set NDEP even though it isn't used.  This makes
   14538             redundant-PUT elimination of previous stores to this field
   14539             work better. */
   14540          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
   14541 
   14542          /* Sheesh.  Aren't you glad it was me and not you that had to
   14543 	    write and validate all this grunge? */
   14544 
   14545 	 DIP("cmpxchg8b %s\n", dis_buf);
   14546 	 break;
   14547       }
   14548 
   14549       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   14550 
   14551       case 0xA2: { /* CPUID */
   14552          /* Uses dirty helper:
   14553                void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
   14554             declared to mod eax, ecx; wr ebx, edx
   14555          */
   14556          IRDirty* d     = NULL;
   14557          HChar*   fName = NULL;
   14558          void*    fAddr = NULL;
   14559          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
   14560             fName = "x86g_dirtyhelper_CPUID_sse2";
   14561             fAddr = &x86g_dirtyhelper_CPUID_sse2;
   14562          }
   14563          else
   14564          if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
   14565             fName = "x86g_dirtyhelper_CPUID_sse1";
   14566             fAddr = &x86g_dirtyhelper_CPUID_sse1;
   14567          }
   14568          else
   14569          if (archinfo->hwcaps == 0/*no SSE*/) {
   14570             fName = "x86g_dirtyhelper_CPUID_sse0";
   14571             fAddr = &x86g_dirtyhelper_CPUID_sse0;
   14572          } else
   14573             vpanic("disInstr(x86)(cpuid)");
   14574 
   14575          vassert(fName); vassert(fAddr);
   14576          d = unsafeIRDirty_0_N ( 0/*regparms*/,
   14577                                  fName, fAddr, mkIRExprVec_0() );
   14578          /* declare guest state effects */
   14579          d->needsBBP = True;
   14580          d->nFxState = 4;
   14581          d->fxState[0].fx     = Ifx_Modify;
   14582          d->fxState[0].offset = OFFB_EAX;
   14583          d->fxState[0].size   = 4;
   14584          d->fxState[1].fx     = Ifx_Write;
   14585          d->fxState[1].offset = OFFB_EBX;
   14586          d->fxState[1].size   = 4;
   14587          d->fxState[2].fx     = Ifx_Modify;
   14588          d->fxState[2].offset = OFFB_ECX;
   14589          d->fxState[2].size   = 4;
   14590          d->fxState[3].fx     = Ifx_Write;
   14591          d->fxState[3].offset = OFFB_EDX;
   14592          d->fxState[3].size   = 4;
   14593          /* execute the dirty call, side-effecting guest state */
   14594          stmt( IRStmt_Dirty(d) );
   14595          /* CPUID is a serialising insn.  So, just in case someone is
   14596             using it as a memory fence ... */
   14597          stmt( IRStmt_MBE(Imbe_Fence) );
   14598          DIP("cpuid\n");
   14599          break;
   14600       }
   14601 
   14602 //--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
   14603 //--             goto decode_failure;
   14604 //--
   14605 //--          t1 = newTemp(cb);
   14606 //--          t2 = newTemp(cb);
   14607 //--          t3 = newTemp(cb);
   14608 //--          t4 = newTemp(cb);
   14609 //--          uInstr0(cb, CALLM_S, 0);
   14610 //--
   14611 //--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
   14612 //--          uInstr1(cb, PUSH,  4, TempReg, t1);
   14613 //--
   14614 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
   14615 //--          uLiteral(cb, 0);
   14616 //--          uInstr1(cb, PUSH,  4, TempReg, t2);
   14617 //--
   14618 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
   14619 //--          uLiteral(cb, 0);
   14620 //--          uInstr1(cb, PUSH,  4, TempReg, t3);
   14621 //--
   14622 //--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
   14623 //--          uLiteral(cb, 0);
   14624 //--          uInstr1(cb, PUSH,  4, TempReg, t4);
   14625 //--
   14626 //--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
   14627 //--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
   14628 //--
   14629 //--          uInstr1(cb, POP,   4, TempReg, t4);
   14630 //--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
   14631 //--
   14632 //--          uInstr1(cb, POP,   4, TempReg, t3);
   14633 //--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
   14634 //--
   14635 //--          uInstr1(cb, POP,   4, TempReg, t2);
   14636 //--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
   14637 //--
   14638 //--          uInstr1(cb, POP,   4, TempReg, t1);
   14639 //--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
   14640 //--
   14641 //--          uInstr0(cb, CALLM_E, 0);
   14642 //--          DIP("cpuid\n");
   14643 //--          break;
   14644 //--
   14645       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   14646 
   14647       case 0xB6: /* MOVZXb Eb,Gv */
   14648          if (sz != 2 && sz != 4)
   14649             goto decode_failure;
   14650          delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
   14651          break;
   14652 
   14653       case 0xB7: /* MOVZXw Ew,Gv */
   14654          if (sz != 4)
   14655             goto decode_failure;
   14656          delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
   14657          break;
   14658 
   14659       case 0xBE: /* MOVSXb Eb,Gv */
   14660          if (sz != 2 && sz != 4)
   14661             goto decode_failure;
   14662          delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
   14663          break;
   14664 
   14665       case 0xBF: /* MOVSXw Ew,Gv */
   14666          if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
   14667             goto decode_failure;
   14668          delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
   14669          break;
   14670 
   14671 //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
   14672 //--
   14673 //--       case 0xC3: /* MOVNTI Gv,Ev */
   14674 //--          vg_assert(sz == 4);
   14675 //--          modrm = getUChar(eip);
   14676 //--          vg_assert(!epartIsReg(modrm));
   14677 //--          t1 = newTemp(cb);
   14678 //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
   14679 //--          pair = disAMode ( cb, sorb, eip, dis_buf );
   14680 //--          t2 = LOW24(pair);
   14681 //--          eip += HI8(pair);
   14682 //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
   14683 //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
   14684 //--          break;
   14685 
   14686       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   14687 
   14688       case 0xAF: /* IMUL Ev, Gv */
   14689          delta = dis_mul_E_G ( sorb, sz, delta );
   14690          break;
   14691 
   14692       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   14693 
   14694       case 0x1F:
   14695          modrm = getUChar(delta);
   14696          if (epartIsReg(modrm)) goto decode_failure;
   14697          addr = disAMode ( &alen, sorb, delta, dis_buf );
   14698          delta += alen;
   14699          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   14700          break;
   14701 
   14702       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   14703       case 0x80: /* JOb (jump overflow) */
   14704       case 0x81: /* JNOb (jump not overflow) */
   14705       case 0x82: /* JBb/JNAEb (jump below) */
   14706       case 0x83: /* JNBb/JAEb (jump not below) */
   14707       case 0x84: /* JZb/JEb (jump zero) */
   14708       case 0x85: /* JNZb/JNEb (jump not zero) */
   14709       case 0x86: /* JBEb/JNAb (jump below or equal) */
   14710       case 0x87: /* JNBEb/JAb (jump not below or equal) */
   14711       case 0x88: /* JSb (jump negative) */
   14712       case 0x89: /* JNSb (jump not negative) */
   14713       case 0x8A: /* JP (jump parity even) */
   14714       case 0x8B: /* JNP/JPO (jump parity odd) */
   14715       case 0x8C: /* JLb/JNGEb (jump less) */
   14716       case 0x8D: /* JGEb/JNLb (jump greater or equal) */
   14717       case 0x8E: /* JLEb/JNGb (jump less or equal) */
   14718       case 0x8F: /* JGb/JNLEb (jump greater) */
   14719        { Int    jmpDelta;
   14720          HChar* comment  = "";
   14721          jmpDelta = (Int)getUDisp32(delta);
   14722          d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
   14723          delta += 4;
   14724          if (resteerCisOk
   14725              && vex_control.guest_chase_cond
   14726              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14727              && jmpDelta < 0
   14728              && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
   14729             /* Speculation: assume this backward branch is taken.  So
   14730                we need to emit a side-exit to the insn following this
   14731                one, on the negation of the condition, and continue at
   14732                the branch target address (d32).  If we wind up back at
   14733                the first instruction of the trace, just stop; it's
   14734                better to let the IR loop unroller handle that case.*/
   14735             stmt( IRStmt_Exit(
   14736                      mk_x86g_calculate_condition((X86Condcode)
   14737                                                  (1 ^ (opc - 0x80))),
   14738                      Ijk_Boring,
   14739                      IRConst_U32(guest_EIP_bbstart+delta) ) );
   14740             dres.whatNext   = Dis_ResteerC;
   14741             dres.continueAt = (Addr64)(Addr32)d32;
   14742             comment = "(assumed taken)";
   14743          }
   14744          else
   14745          if (resteerCisOk
   14746              && vex_control.guest_chase_cond
   14747              && (Addr32)d32 != (Addr32)guest_EIP_bbstart
   14748              && jmpDelta >= 0
   14749              && resteerOkFn( callback_opaque,
   14750                              (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
   14751             /* Speculation: assume this forward branch is not taken.
   14752                So we need to emit a side-exit to d32 (the dest) and
   14753                continue disassembling at the insn immediately
   14754                following this one. */
   14755             stmt( IRStmt_Exit(
   14756                      mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
   14757                      Ijk_Boring,
   14758                      IRConst_U32(d32) ) );
   14759             dres.whatNext   = Dis_ResteerC;
   14760             dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
   14761             comment = "(assumed not taken)";
   14762          }
   14763          else {
   14764             /* Conservative default translation - end the block at
   14765                this point. */
   14766             jcc_01( (X86Condcode)(opc - 0x80),
   14767                     (Addr32)(guest_EIP_bbstart+delta), d32);
   14768             dres.whatNext = Dis_StopHere;
   14769          }
   14770          DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
   14771          break;
   14772        }
   14773 
   14774       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   14775       case 0x31: { /* RDTSC */
   14776          IRTemp   val  = newTemp(Ity_I64);
   14777          IRExpr** args = mkIRExprVec_0();
   14778          IRDirty* d    = unsafeIRDirty_1_N (
   14779                             val,
   14780                             0/*regparms*/,
   14781                             "x86g_dirtyhelper_RDTSC",
   14782                             &x86g_dirtyhelper_RDTSC,
   14783                             args
   14784                          );
   14785          /* execute the dirty call, dumping the result in val. */
   14786          stmt( IRStmt_Dirty(d) );
   14787          putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
   14788          putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
   14789          DIP("rdtsc\n");
   14790          break;
   14791       }
   14792 
   14793       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
   14794 
   14795       case 0xA1: /* POP %FS */
   14796          dis_pop_segreg( R_FS, sz ); break;
   14797       case 0xA9: /* POP %GS */
   14798          dis_pop_segreg( R_GS, sz ); break;
   14799 
   14800       case 0xA0: /* PUSH %FS */
   14801          dis_push_segreg( R_FS, sz ); break;
   14802       case 0xA8: /* PUSH %GS */
   14803          dis_push_segreg( R_GS, sz ); break;
   14804 
   14805       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   14806       case 0x90: /* set-Ob (set if overflow) */
   14807       case 0x91: /* set-NOb (set if not overflow) */
   14808       case 0x92: /* set-Bb/set-NAEb (set if below) */
   14809       case 0x93: /* set-NBb/set-AEb (set if not below) */
   14810       case 0x94: /* set-Zb/set-Eb (set if zero) */
   14811       case 0x95: /* set-NZb/set-NEb (set if not zero) */
   14812       case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   14813       case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   14814       case 0x98: /* set-Sb (set if negative) */
   14815       case 0x99: /* set-NSb (set if not negative) */
   14816       case 0x9A: /* set-P (set if parity even) */
   14817       case 0x9B: /* set-NP (set if parity odd) */
   14818       case 0x9C: /* set-Lb/set-NGEb (set if less) */
   14819       case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   14820       case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   14821       case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   14822          t1 = newTemp(Ity_I8);
   14823          assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
   14824          modrm = getIByte(delta);
   14825          if (epartIsReg(modrm)) {
   14826             delta++;
   14827             putIReg(1, eregOfRM(modrm), mkexpr(t1));
   14828             DIP("set%s %s\n", name_X86Condcode(opc-0x90),
   14829                               nameIReg(1,eregOfRM(modrm)));
   14830          } else {
   14831            addr = disAMode ( &alen, sorb, delta, dis_buf );
   14832            delta += alen;
   14833            storeLE( mkexpr(addr), mkexpr(t1) );
   14834            DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
   14835          }
   14836          break;
   14837 
   14838       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   14839 
   14840       case 0xA4: /* SHLDv imm8,Gv,Ev */
   14841          modrm = getIByte(delta);
   14842          d32   = delta + lengthAMode(delta);
   14843          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14844          delta = dis_SHLRD_Gv_Ev (
   14845                   sorb, delta, modrm, sz,
   14846                   mkU8(getIByte(d32)), True, /* literal */
   14847                   dis_buf, True );
   14848          break;
   14849       case 0xA5: /* SHLDv %cl,Gv,Ev */
   14850          modrm = getIByte(delta);
   14851          delta = dis_SHLRD_Gv_Ev (
   14852                     sorb, delta, modrm, sz,
   14853                     getIReg(1,R_ECX), False, /* not literal */
   14854                     "%cl", True );
   14855          break;
   14856 
   14857       case 0xAC: /* SHRDv imm8,Gv,Ev */
   14858          modrm = getIByte(delta);
   14859          d32   = delta + lengthAMode(delta);
   14860          vex_sprintf(dis_buf, "$%d", getIByte(d32));
   14861          delta = dis_SHLRD_Gv_Ev (
   14862                     sorb, delta, modrm, sz,
   14863                     mkU8(getIByte(d32)), True, /* literal */
   14864                     dis_buf, False );
   14865          break;
   14866       case 0xAD: /* SHRDv %cl,Gv,Ev */
   14867          modrm = getIByte(delta);
   14868          delta = dis_SHLRD_Gv_Ev (
   14869                     sorb, delta, modrm, sz,
   14870                     getIReg(1,R_ECX), False, /* not literal */
   14871                     "%cl", False );
   14872          break;
   14873 
   14874       /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
   14875 
   14876       case 0x34: /* SYSENTER */
   14877          /* Simple implementation needing a long explanation.
   14878 
   14879             sysenter is a kind of syscall entry.  The key thing here
   14880             is that the return address is not known -- that is
   14881             something that is beyond Vex's knowledge.  So this IR
   14882             forces a return to the scheduler, which can do what it
   14883             likes to simulate the sysenter, but it MUST set this
   14884             thread's guest_EIP field with the continuation address
   14885             before resuming execution.  If that doesn't happen, the
   14886             thread will jump to address zero, which is probably
   14887             fatal.
   14888          */
   14889 
   14890          /* Note where we are, so we can back up the guest to this
   14891             point if the syscall needs to be restarted. */
   14892          stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
   14893                            mkU32(guest_EIP_curr_instr) ) );
   14894          jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
   14895          dres.whatNext = Dis_StopHere;
   14896          DIP("sysenter\n"); /* newline-terminated, like every other DIP here */
   14897          break;
   14898 
   14899       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   14900 
   14901       case 0xC0: { /* XADD Gb,Eb */
   14902          Bool decodeOK = False; /* fail safe should the helper not write it */
   14903          delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
   14904          if (!decodeOK) goto decode_failure;
   14905          break;
   14906       }
   14907       case 0xC1: { /* XADD Gv,Ev */
   14908          Bool decodeOK = False; /* fail safe should the helper not write it */
   14909          delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
   14910          if (!decodeOK) goto decode_failure;
   14911          break;
   14912       }
   14913 
   14914       /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
   14915 
   14916       case 0x71:
   14917       case 0x72:
   14918       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   14919 
   14920       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   14921       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   14922       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   14923       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   14924 
   14925       case 0xFC:
   14926       case 0xFD:
   14927       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   14928 
   14929       case 0xEC:
   14930       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14931 
   14932       case 0xDC:
   14933       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14934 
   14935       case 0xF8:
   14936       case 0xF9:
   14937       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   14938 
   14939       case 0xE8:
   14940       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14941 
   14942       case 0xD8:
   14943       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   14944 
   14945       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   14946       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   14947 
   14948       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   14949 
   14950       case 0x74:
   14951       case 0x75:
   14952       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   14953 
   14954       case 0x64:
   14955       case 0x65:
   14956       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   14957 
   14958       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   14959       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   14960       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   14961 
   14962       case 0x68:
   14963       case 0x69:
   14964       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   14965 
   14966       case 0x60:
   14967       case 0x61:
   14968       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14969 
   14970       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   14971       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   14972       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   14973       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   14974 
   14975       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14976       case 0xF2:
   14977       case 0xF3:
   14978 
   14979       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   14980       case 0xD2:
   14981       case 0xD3:
   14982 
   14983       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   14984       case 0xE2:
   14985       {
   14986          Int  delta0    = delta-1;
   14987          Bool decode_OK = False;
   14988 
   14989          /* If sz==2 this is SSE, and we assume sse idec has
   14990             already spotted those cases by now. */
   14991          if (sz != 4)
   14992             goto decode_failure;
   14993 
   14994          delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
   14995          if (!decode_OK) {
   14996             delta = delta0;
   14997             goto decode_failure;
   14998          }
   14999          break;
   15000       }
   15001 
   15002       case 0x77: /* EMMS */
   15003          if (sz != 4)
   15004             goto decode_failure;
   15005          do_EMMS_preamble();
   15006          DIP("emms\n");
   15007          break;
   15008 
   15009       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   15010       case 0x01: /* 0F 01 /0 -- SGDT */
   15011                  /* 0F 01 /1 -- SIDT */
   15012       {
   15013           /* This is really revolting, but ... since each processor
   15014              (core) only has one IDT and one GDT, just let the guest
   15015              see it (pass-through semantics).  I can't see any way to
   15016              construct a faked-up value, so don't bother to try. */
   15017          modrm = getUChar(delta);
   15018          if (epartIsReg(modrm)) goto decode_failure; /* memory operand only */
   15019          if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
   15020             goto decode_failure; /* only /0 (SGDT) and /1 (SIDT) handled */
   15021          addr = disAMode ( &alen, sorb, delta, dis_buf ); /* modrm vetted above */
   15022          delta += alen;
   15023          switch (gregOfRM(modrm)) {
   15024             case 0: DIP("sgdt %s\n", dis_buf); break;
   15025             case 1: DIP("sidt %s\n", dis_buf); break;
   15026             default: vassert(0); /*NOTREACHED*/
   15027          }
   15028 
   15029          IRDirty* d = unsafeIRDirty_0_N (
   15030                           0/*regparms*/,
   15031                           "x86g_dirtyhelper_SxDT",
   15032                           &x86g_dirtyhelper_SxDT,
   15033                           mkIRExprVec_2( mkexpr(addr),
   15034                                          mkU32(gregOfRM(modrm)) )
   15035                       );
   15036          /* declare we're writing memory: the helper stores the 6-byte
   15037             descriptor-table register image at addr */
   15038          d->mFx   = Ifx_Write;
   15039          d->mAddr = mkexpr(addr);
   15040          d->mSize = 6;
   15041          stmt( IRStmt_Dirty(d) );
   15042          break;
   15043       }
   15043 
   15044       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   15045 
   15046       default:
   15047          goto decode_failure;
   15048    } /* switch (opc) for the 2-byte opcodes */
   15049    goto decode_success;
   15050    } /* case 0x0F: of primary opcode */
   15051 
   15052    /* ------------------------ ??? ------------------------ */
   15053 
   15054   default:
   15055   decode_failure:
   15056    /* All decode failures end up here. */
   15057    vex_printf("vex x86->IR: unhandled instruction bytes: "
   15058               "0x%x 0x%x 0x%x 0x%x\n",
   15059               (Int)getIByte(delta_start+0),
   15060               (Int)getIByte(delta_start+1),
   15061               (Int)getIByte(delta_start+2),
   15062               (Int)getIByte(delta_start+3) );
   15063 
   15064    /* Tell the dispatcher that this insn cannot be decoded, and so has
   15065       not been executed, and (is currently) the next to be executed.
   15066       EIP should be up-to-date since it made so at the start of each
   15067       insn, but nevertheless be paranoid and update it again right
   15068       now. */
   15069    stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
   15070    jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
   15071    dres.whatNext = Dis_StopHere;
   15072    dres.len = 0;
   15073    /* We also need to say that a CAS is not expected now, regardless
   15074       of what it might have been set to at the start of the function,
   15075       since the IR that we've emitted just above (to synthesis a
   15076       SIGILL) does not involve any CAS, and presumably no other IR has
   15077       been emitted for this (non-decoded) insn. */
   15078    *expect_CAS = False;
   15079    return dres;
   15080 
   15081    } /* switch (opc) for the main (primary) opcode switch. */
   15082 
   15083   decode_success:
   15084    /* All decode successes end up here. */
   15085    DIP("\n");
   15086    dres.len = delta - delta_start;
   15087    return dres;
   15088 }
   15089 
   15090 #undef DIP
   15091 #undef DIS
   15092 
   15093 
   15094 /*------------------------------------------------------------*/
   15095 /*--- Top-level fn                                         ---*/
   15096 /*------------------------------------------------------------*/
   15097 
   15098 /* Disassemble a single instruction into IR.  The instruction
   15099    is located in host memory at &guest_code[delta]. */
   15100 
   15101 DisResult disInstr_X86 ( IRSB*        irsb_IN,
   15102                          Bool         put_IP,
   15103                          Bool         (*resteerOkFn) ( void*, Addr64 ),
   15104                          Bool         resteerCisOk,
   15105                          void*        callback_opaque,
   15106                          UChar*       guest_code_IN,
   15107                          Long         delta,
   15108                          Addr64       guest_IP,
   15109                          VexArch      guest_arch,
   15110                          VexArchInfo* archinfo,
   15111                          VexAbiInfo*  abiinfo,
   15112                          Bool         host_bigendian_IN )
   15113 {
   15114    Int       i, x1, x2;
   15115    Bool      expect_CAS, has_CAS;
   15116    DisResult dres;
   15117 
   15118    /* Set globals (see top of this file) */
   15119    vassert(guest_arch == VexArchX86);
   15120    guest_code           = guest_code_IN;
   15121    irsb                 = irsb_IN;
   15122    host_is_bigendian    = host_bigendian_IN;
   15123    guest_EIP_curr_instr = (Addr32)guest_IP;
   15124    guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
   15125 
   15126    x1 = irsb_IN->stmts_used;
   15127    expect_CAS = False;
   15128    dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
   15129                              resteerCisOk,
   15130                              callback_opaque,
   15131                              delta, archinfo, abiinfo );
   15132    x2 = irsb_IN->stmts_used;
   15133    vassert(x2 >= x1);
   15134 
   15135    /* See comment at the top of disInstr_X86_WRK for meaning of
   15136       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   15137       IRCAS as directed by the returned expect_CAS value. */
   15138    has_CAS = False;
   15139    for (i = x1; i < x2; i++) {
   15140       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   15141          has_CAS = True;
   15142    }
   15143 
   15144    if (expect_CAS != has_CAS) {
   15145       /* inconsistency detected.  re-disassemble the instruction so as
   15146          to generate a useful error message; then assert. */
   15147       vex_traceflags |= VEX_TRACE_FE;
   15148       dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
   15149                                 resteerCisOk,
   15150                                 callback_opaque,
   15151                                 delta, archinfo, abiinfo );
   15152       for (i = x1; i < x2; i++) {
   15153          vex_printf("\t\t");
   15154          ppIRStmt(irsb_IN->stmts[i]);
   15155          vex_printf("\n");
   15156       }
   15157       /* Failure of this assertion is serious and denotes a bug in
   15158          disInstr. */
   15159       vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
   15160    }
   15161 
   15162    return dres;
   15163 }
   15164 
   15165 
   15166 /*--------------------------------------------------------------------*/
   15167 /*--- end                                         guest_x86_toIR.c ---*/
   15168 /*--------------------------------------------------------------------*/
   15169